From d4f65b5d2497b2fd9c45f06b71deb4ab084a5b66 Mon Sep 17 00:00:00 2001
From: David Howells
Date: Thu, 13 Sep 2012 13:06:29 +0100
Subject: KEYS: Add payload preparsing opportunity prior to key instantiate or update

Give the key type the opportunity to preparse the payload prior to the instantiation and update routines being called. This is done with the provision of two new key type operations:

	int (*preparse)(struct key_preparsed_payload *prep);
	void (*free_preparse)(struct key_preparsed_payload *prep);

If the first operation is present, then it is called before key creation (in the add key case) or before the key semaphore is taken (in the update or instantiate cases). The second operation is called to clean up if the first was called.

preparse() is given the opportunity to fill in the following structure:

	struct key_preparsed_payload {
		char		*description;
		void		*type_data[2];
		void		*payload;
		const void	*data;
		size_t		datalen;
		size_t		quotalen;
	};

Before the preparser is called, the first three fields will have been cleared, the payload pointer and size will be stored in data and datalen, and the default quota size from the key_type struct will be stored into quotalen.

The preparser may parse the payload in any way it likes and may store data in the type_data[] and payload fields for use by the instantiate() and update() ops.

The preparser may also propose a description for the key by attaching it as a string to the description field. This can be used by passing a NULL or "" description to the add_key() system call or the key_create_or_update() function. This cannot work with request_key() as that requires the description to tell the upcall about the key to be created.

This, for example, permits keys that store PGP public keys to generate their own name from the user ID and public key fingerprint in the key.

The instantiate() and update() operations are then modified to look like this:

	int (*instantiate)(struct key *key, struct key_preparsed_payload *prep);
	int (*update)(struct key *key, struct key_preparsed_payload *prep);

and the new payload data is passed in *prep, whether or not it was preparsed.
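As a rough illustration of the new hooks (not taken from this patch; the example_* names are hypothetical, and the usual key-type headers are assumed), a key type might preparse a simple blob payload like this, doing allocation and validation before the key semaphore is taken and letting instantiate() just adopt the result:

	static int example_preparse(struct key_preparsed_payload *prep)
	{
		void *copy;

		/* validate and parse outside the key semaphore */
		if (!prep->data || prep->datalen == 0 || prep->datalen > 32768)
			return -EINVAL;

		copy = kmemdup(prep->data, prep->datalen, GFP_KERNEL);
		if (!copy)
			return -ENOMEM;
		prep->payload = copy;
		prep->quotalen = prep->datalen;	/* override the default quota size */
		return 0;
	}

	static void example_free_preparse(struct key_preparsed_payload *prep)
	{
		/* frees the copy unless instantiate() took ownership of it */
		kfree(prep->payload);
	}

	static int example_instantiate(struct key *key,
				       struct key_preparsed_payload *prep)
	{
		/* adopt the preparsed payload; this runs under the key semaphore */
		key->payload.data = prep->payload;
		prep->payload = NULL;
		return 0;
	}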
Signed-off-by: David Howells --- fs/cifs/cifs_spnego.c | 6 +++--- fs/cifs/cifsacl.c | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index e622863b292f..086f381d6489 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -31,18 +31,18 @@ /* create a new cifs key */ static int -cifs_spnego_key_instantiate(struct key *key, const void *data, size_t datalen) +cifs_spnego_key_instantiate(struct key *key, struct key_preparsed_payload *prep) { char *payload; int ret; ret = -ENOMEM; - payload = kmalloc(datalen, GFP_KERNEL); + payload = kmalloc(prep->datalen, GFP_KERNEL); if (!payload) goto error; /* attach the data */ - memcpy(payload, data, datalen); + memcpy(payload, prep->data, prep->datalen); key->payload.data = payload; ret = 0; diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 05f4dc263a23..f3c60e264ca8 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -167,17 +167,17 @@ static struct shrinker cifs_shrinker = { }; static int -cifs_idmap_key_instantiate(struct key *key, const void *data, size_t datalen) +cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep) { char *payload; - payload = kmalloc(datalen, GFP_KERNEL); + payload = kmalloc(prep->datalen, GFP_KERNEL); if (!payload) return -ENOMEM; - memcpy(payload, data, datalen); + memcpy(payload, prep->data, prep->datalen); key->payload.data = payload; - key->datalen = datalen; + key->datalen = prep->datalen; return 0; } -- cgit v1.2.1 From f8aa23a55f813c9bddec2a6176e0e67274e6e7c1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 2 Oct 2012 19:24:56 +0100 Subject: KEYS: Use keyring_alloc() to create special keyrings Use keyring_alloc() to create special keyrings now that it has a permissions parameter rather than using key_alloc() + key_instantiate_and_link(). Also document and export keyring_alloc() so that modules can use it too. 
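Now that keyring_alloc() is exported, a module can create and instantiate a special keyring in one call; a minimal sketch following the call shape used in the conversions below (the ".example" keyring name is hypothetical):

	static struct key *example_keyring;

	static int example_init_keyring(const struct cred *cred)
	{
		struct key *keyring;

		keyring = keyring_alloc(".example", 0, 0, cred,
					(KEY_POS_ALL & ~KEY_POS_SETATTR) |
					KEY_USR_VIEW | KEY_USR_READ,
					KEY_ALLOC_NOT_IN_QUOTA, NULL);
		if (IS_ERR(keyring))
			return PTR_ERR(keyring);

		/* the keyring comes back instantiated; no separate
		 * key_instantiate_and_link() step is needed */
		example_keyring = keyring;
		return 0;
	}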
Signed-off-by: David Howells --- fs/cifs/cifsacl.c | 12 ++++-------- fs/nfs/idmap.c | 12 ++++-------- 2 files changed, 8 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 05f4dc263a23..a8a753c8fcd5 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -537,19 +537,15 @@ init_cifs_idmap(void) if (!cred) return -ENOMEM; - keyring = key_alloc(&key_type_keyring, ".cifs_idmap", 0, 0, cred, - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ, - KEY_ALLOC_NOT_IN_QUOTA); + keyring = keyring_alloc(".cifs_idmap", 0, 0, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto failed_put_cred; } - ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); - if (ret < 0) - goto failed_put_key; - ret = register_key_type(&cifs_idmap_key_type); if (ret < 0) goto failed_put_key; diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index a850079467d8..957134b4c0fd 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -192,19 +192,15 @@ static int nfs_idmap_init_keyring(void) if (!cred) return -ENOMEM; - keyring = key_alloc(&key_type_keyring, ".id_resolver", 0, 0, cred, - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ, - KEY_ALLOC_NOT_IN_QUOTA); + keyring = keyring_alloc(".id_resolver", 0, 0, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto failed_put_cred; } - ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); - if (ret < 0) - goto failed_put_key; - ret = register_key_type(&key_type_id_resolver); if (ret < 0) goto failed_put_key; -- cgit v1.2.1 From 33c7a2bc48a81fa714572f8ce29f29bc17e6faf0 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:55:59 +1100 Subject: xfs: xfs_syncd_stop must die xfs_syncd_start and xfs_syncd_stop tie a bunch of unrelated functionality together that actually have different start and stop requirements. Kill these functions and open code the start/stop methods for each of the background functions. Subsequent patches will move the start/stop functions around to the correct places to avoid races and shutdown issues.
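Condensed from the hunks below, the open-coded pattern looks roughly like this (illustrative grouping only; the patch open codes these calls at each call site rather than wrapping them):

	/* at mount time, before any background work can run: */
	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
	xfs_syncd_queue_sync(mp);

	/* at teardown, each work item is cancelled individually: */
	cancel_delayed_work_sync(&mp->m_sync_work);
	cancel_delayed_work_sync(&mp->m_reclaim_work);
	cancel_work_sync(&mp->m_flush_work);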
Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_super.c | 25 ++++++++++++++++++------- fs/xfs/xfs_sync.c | 30 ++++-------------------------- fs/xfs/xfs_sync.h | 6 ++++-- 3 files changed, 26 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 26a09bd7f975..37d1bbce047d 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1008,7 +1008,11 @@ xfs_fs_put_super( xfs_filestream_unmount(mp); cancel_delayed_work_sync(&mp->m_sync_work); xfs_unmountfs(mp); - xfs_syncd_stop(mp); + + cancel_delayed_work_sync(&mp->m_sync_work); + cancel_delayed_work_sync(&mp->m_reclaim_work); + cancel_work_sync(&mp->m_flush_work); + xfs_freesb(mp); xfs_icsb_destroy_counters(mp); xfs_destroy_mount_workqueues(mp); @@ -1384,9 +1388,11 @@ xfs_fs_fill_super( sb->s_time_gran = 1; set_posix_acl_flag(sb); - error = xfs_syncd_init(mp); - if (error) - goto out_filestream_unmount; + INIT_WORK(&mp->m_flush_work, xfs_flush_worker); + INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker); + INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); + + xfs_syncd_queue_sync(mp); error = xfs_mountfs(mp); if (error) @@ -1409,8 +1415,10 @@ xfs_fs_fill_super( return 0; out_syncd_stop: - xfs_syncd_stop(mp); - out_filestream_unmount: + cancel_delayed_work_sync(&mp->m_sync_work); + cancel_delayed_work_sync(&mp->m_reclaim_work); + cancel_work_sync(&mp->m_flush_work); + xfs_filestream_unmount(mp); out_free_sb: xfs_freesb(mp); @@ -1429,7 +1437,10 @@ out_destroy_workqueues: out_unmount: xfs_filestream_unmount(mp); xfs_unmountfs(mp); - xfs_syncd_stop(mp); + + cancel_delayed_work_sync(&mp->m_sync_work); + cancel_delayed_work_sync(&mp->m_reclaim_work); + cancel_work_sync(&mp->m_flush_work); goto out_free_sb; } diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 9500caf15acf..7502f0621fb9 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -370,7 +370,7 @@ xfs_quiesce_attr( xfs_buf_unlock(mp->m_sb_bp); } -static void +void xfs_syncd_queue_sync( struct xfs_mount *mp) { @@ -383,7 +383,7 @@ xfs_syncd_queue_sync( * disk quotas. We might need to cover the log to indicate that the * filesystem is idle and not frozen. */ -STATIC void +void xfs_sync_worker( struct work_struct *work) { @@ -445,7 +445,7 @@ xfs_syncd_queue_reclaim( * goes low. It scans as quickly as possible avoiding locked inodes or those * already being flushed, and once done schedules a future pass. 
*/ -STATIC void +void xfs_reclaim_worker( struct work_struct *work) { @@ -478,7 +478,7 @@ xfs_flush_inodes( flush_work(&mp->m_flush_work); } -STATIC void +void xfs_flush_worker( struct work_struct *work) { @@ -489,28 +489,6 @@ xfs_flush_worker( xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT); } -int -xfs_syncd_init( - struct xfs_mount *mp) -{ - INIT_WORK(&mp->m_flush_work, xfs_flush_worker); - INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker); - INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); - - xfs_syncd_queue_sync(mp); - - return 0; -} - -void -xfs_syncd_stop( - struct xfs_mount *mp) -{ - cancel_delayed_work_sync(&mp->m_sync_work); - cancel_delayed_work_sync(&mp->m_reclaim_work); - cancel_work_sync(&mp->m_flush_work); -} - void __xfs_inode_set_reclaim_tag( struct xfs_perag *pag, diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h index 941202e7ac6e..3f59e5bed66b 100644 --- a/fs/xfs/xfs_sync.h +++ b/fs/xfs/xfs_sync.h @@ -26,8 +26,10 @@ struct xfs_perag; extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ -int xfs_syncd_init(struct xfs_mount *mp); -void xfs_syncd_stop(struct xfs_mount *mp); +void xfs_syncd_queue_sync(struct xfs_mount *mp); +void xfs_sync_worker(struct work_struct *work); +void xfs_flush_worker(struct work_struct *work); +void xfs_reclaim_worker(struct work_struct *work); int xfs_quiesce_data(struct xfs_mount *mp); void xfs_quiesce_attr(struct xfs_mount *mp); -- cgit v1.2.1 From 7e18530bef6a18a5479690ae7e8256319ecf1300 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:00 +1100 Subject: xfs: rationalise xfs_mount_wq users Instead of starting and stopping background work on the xfs_mount_wq all at the same time, separate them to where they really are needed to start and stop. The xfs_sync_worker only needs to be started after all the mount processing has completed successfully, while it needs to be stopped before the log is unmounted. The xfs_reclaim_worker is started on demand, and can be stopped before the unmount process does its own inode reclaim pass. The xfs_flush_inodes work is run on demand, and so we really only need to ensure that it has stopped running before we start processing an unmount, freeze or remount,ro. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_mount.c | 6 ++++-- fs/xfs/xfs_super.c | 34 ++++++++++++++-------------------- fs/xfs/xfs_sync.c | 21 +++++----------------- 3 files changed, 23 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b2bd3a0e6376..d9a31c6a0c53 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1450,9 +1450,11 @@ xfs_unmountfs( /* * And reclaim all inodes. At this point there should be no dirty - * inode, and none should be pinned or locked, but use synchronous - * reclaim just to be sure. + * inodes and none should be pinned or locked, but use synchronous + * reclaim just to be sure. We can stop background inode reclaim + * here as well if it is still running.
*/ + cancel_delayed_work_sync(&mp->m_reclaim_work); xfs_reclaim_inodes(mp, SYNC_WAIT); xfs_qm_unmount(mp); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 37d1bbce047d..9805cac81fc9 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1005,14 +1005,12 @@ xfs_fs_put_super( { struct xfs_mount *mp = XFS_M(sb); - xfs_filestream_unmount(mp); - cancel_delayed_work_sync(&mp->m_sync_work); - xfs_unmountfs(mp); - cancel_delayed_work_sync(&mp->m_sync_work); - cancel_delayed_work_sync(&mp->m_reclaim_work); cancel_work_sync(&mp->m_flush_work); + xfs_filestream_unmount(mp); + xfs_unmountfs(mp); + xfs_freesb(mp); xfs_icsb_destroy_counters(mp); xfs_destroy_mount_workqueues(mp); @@ -1325,6 +1323,9 @@ xfs_fs_fill_super( spin_lock_init(&mp->m_sb_lock); mutex_init(&mp->m_growlock); atomic_set(&mp->m_active_trans, 0); + INIT_WORK(&mp->m_flush_work, xfs_flush_worker); + INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker); + INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); mp->m_super = sb; sb->s_fs_info = mp; @@ -1388,15 +1389,9 @@ xfs_fs_fill_super( sb->s_time_gran = 1; set_posix_acl_flag(sb); - INIT_WORK(&mp->m_flush_work, xfs_flush_worker); - INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker); - INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); - - xfs_syncd_queue_sync(mp); - error = xfs_mountfs(mp); if (error) - goto out_syncd_stop; + goto out_filestream_unmount; root = igrab(VFS_I(mp->m_rootip)); if (!root) { @@ -1413,12 +1408,15 @@ xfs_fs_fill_super( goto out_unmount; } + /* + * The filesystem is successfully mounted, so we can start background + * sync work now. + */ + xfs_syncd_queue_sync(mp); + return 0; - out_syncd_stop: - cancel_delayed_work_sync(&mp->m_sync_work); - cancel_delayed_work_sync(&mp->m_reclaim_work); - cancel_work_sync(&mp->m_flush_work); + out_filestream_unmount: xfs_filestream_unmount(mp); out_free_sb: xfs_freesb(mp); @@ -1437,10 +1435,6 @@ out_destroy_workqueues: out_unmount: xfs_filestream_unmount(mp); xfs_unmountfs(mp); - - cancel_delayed_work_sync(&mp->m_sync_work); - cancel_delayed_work_sync(&mp->m_reclaim_work); - cancel_work_sync(&mp->m_flush_work); goto out_free_sb; } diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 7502f0621fb9..a68761696ab5 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -379,9 +379,9 @@ xfs_syncd_queue_sync( } /* - * Every sync period we need to unpin all items, reclaim inodes and sync - * disk quotas. We might need to cover the log to indicate that the - * filesystem is idle and not frozen. + * Every sync period we need to unpin all items in the AIL and push them to + * disk. If there is nothing dirty, then we might need to cover the log to + * indicate that the filesystem is idle and not frozen. */ void xfs_sync_worker( @@ -391,17 +391,7 @@ xfs_sync_worker( struct xfs_mount, m_sync_work); int error; - /* - * We shouldn't write/force the log if we are in the mount/unmount - * process or on a read only filesystem. The workqueue still needs to be - * active in both cases, however, because it is used for inode reclaim - * during these times. Use the MS_ACTIVE flag to avoid doing anything - * during mount. Doing work during unmount is avoided by calling - * cancel_delayed_work_sync on this work queue before tearing down - * the ail and the log in xfs_log_unmount. 
- */ - if (!(mp->m_super->s_flags & MS_ACTIVE) && - !(mp->m_flags & XFS_MOUNT_RDONLY)) { + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { /* dgc: errors ignored here */ if (mp->m_super->s_writers.frozen == SB_UNFROZEN && xfs_log_need_covered(mp)) @@ -409,8 +399,7 @@ xfs_sync_worker( else xfs_log_force(mp, 0); - /* start pushing all the metadata that is currently - * dirty */ + /* start pushing all the metadata that is currently dirty */ xfs_ail_push_all(mp->m_ail); } -- cgit v1.2.1 From 7f7bebefba152c5bdfe961cd2e97e8695a32998c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:01 +1100 Subject: xfs: don't run the sync work if the filesystem is read-only If the filesystem is mounted or remounted read-only, stop the sync worker that tries to flush or cover the log if the filesystem is dirty. It's read-only, so it isn't dirty. Restart it on a remount,rw as necessary. This avoids the need for RO checks in the work. Similarly, stop the sync work when the filesystem is frozen, and start it again when the filesysetm is thawed. This avoids the need for special freeze checks in the work. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_super.c | 2 ++ fs/xfs/xfs_sync.c | 29 ++++++++++++++++------------- 2 files changed, 18 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 9805cac81fc9..20fa955d80d1 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1200,6 +1200,7 @@ xfs_fs_remount( * value if it is non-zero, otherwise go with the default. */ xfs_restore_resvblks(mp); + xfs_syncd_queue_sync(mp); } /* rw -> ro */ @@ -1245,6 +1246,7 @@ xfs_fs_unfreeze( struct xfs_mount *mp = XFS_M(sb); xfs_restore_resvblks(mp); + xfs_syncd_queue_sync(mp); return 0; } diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index a68761696ab5..e898d1807044 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -323,6 +323,9 @@ xfs_quiesce_data( * Second stage of a quiesce. The data is already synced, now we have to take * care of the metadata. New transactions are already blocked, so we need to * wait for any remaining transactions to drain out before proceeding. + * + * Note: this stops background sync work - the callers must ensure it is started + * again when appropriate. */ void xfs_quiesce_attr( @@ -341,6 +344,9 @@ xfs_quiesce_attr( /* flush all pending changes from the AIL */ xfs_ail_push_all_sync(mp->m_ail); + /* stop background sync work */ + cancel_delayed_work_sync(&mp->m_sync_work); + /* * Just warn here till VFS can correctly support * read-only remount without racing. @@ -379,9 +385,8 @@ xfs_syncd_queue_sync( } /* - * Every sync period we need to unpin all items in the AIL and push them to - * disk. If there is nothing dirty, then we might need to cover the log to - * indicate that the filesystem is idle and not frozen. + * Every sync period we need to push dirty metadata and try to cover the log + * to indicate the filesystem is idle and not frozen. 
*/ void xfs_sync_worker( @@ -391,17 +396,15 @@ xfs_sync_worker( struct xfs_mount, m_sync_work); int error; - if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { - /* dgc: errors ignored here */ - if (mp->m_super->s_writers.frozen == SB_UNFROZEN && - xfs_log_need_covered(mp)) - error = xfs_fs_log_dummy(mp); - else - xfs_log_force(mp, 0); + /* dgc: errors ignored here */ + if (mp->m_super->s_writers.frozen == SB_UNFROZEN && + xfs_log_need_covered(mp)) + error = xfs_fs_log_dummy(mp); + else + xfs_log_force(mp, 0); - /* start pushing all the metadata that is currently dirty */ - xfs_ail_push_all(mp->m_ail); - } + /* start pushing all the metadata that is currently dirty */ + xfs_ail_push_all(mp->m_ail); /* queue us up again */ xfs_syncd_queue_sync(mp); -- cgit v1.2.1 From f661f1e0bf5002bdcc8b5810ad0a184a1841537f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:02 +1100 Subject: xfs: sync work is now only periodic log work The only thing the periodic sync work does now is flush the AIL and idle the log. These are really functions of the log code, so move the work to xfs_log.c and rename it appropriately. The only wart that this leaves behind is the xfssyncd_centisecs sysctl, otherwise the xfssyncd is dead. Clean up any comments that related to xfssyncd to reflect its passing. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 61 ++++++++++++++++++++++++++++++++++++++++++--------- fs/xfs/xfs_log.h | 3 +++ fs/xfs/xfs_log_priv.h | 1 + fs/xfs/xfs_mount.h | 1 - fs/xfs/xfs_super.c | 16 ++++---------- fs/xfs/xfs_sync.c | 39 +++----------------------------- fs/xfs/xfs_sync.h | 2 -- 7 files changed, 62 insertions(+), 61 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 7f4f9370d0e7..efea12bfbd6b 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -34,6 +34,7 @@ #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_trace.h" +#include "xfs_fsops.h" kmem_zone_t *xfs_log_ticket_zone; @@ -679,25 +680,29 @@ out: } /* - * Finish the recovery of the file system. This is separate from - * the xfs_log_mount() call, because it depends on the code in - * xfs_mountfs() to read in the root and real-time bitmap inodes - * between calling xfs_log_mount() and here. + * Finish the recovery of the file system. This is separate from the + * xfs_log_mount() call, because it depends on the code in xfs_mountfs() to read + * in the root and real-time bitmap inodes between calling xfs_log_mount() and + * here. * - * mp - ubiquitous xfs mount point structure + * If we finish recovery successfully, start the background log work. If we are + * not doing recovery, then we have a RO filesystem and we don't need to start + * it.
*/ int xfs_log_mount_finish(xfs_mount_t *mp) { - int error; + int error = 0; - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { error = xlog_recover_finish(mp->m_log); - else { - error = 0; + if (!error) + xfs_log_work_queue(mp); + } else { ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); } + return error; } @@ -858,7 +863,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) void xfs_log_unmount(xfs_mount_t *mp) { - cancel_delayed_work_sync(&mp->m_sync_work); + cancel_delayed_work_sync(&mp->m_log->l_work); xfs_trans_ail_destroy(mp); xlog_dealloc_log(mp->m_log); } @@ -1161,6 +1166,40 @@ done: } /* xlog_get_iclog_buffer_size */ +void +xfs_log_work_queue( + struct xfs_mount *mp) +{ + queue_delayed_work(xfs_syncd_wq, &mp->m_log->l_work, + msecs_to_jiffies(xfs_syncd_centisecs * 10)); +} + +/* + * Every sync period we need to unpin all items in the AIL and push them to + * disk. If there is nothing dirty, then we might need to cover the log to + * indicate that the filesystem is idle. + */ +void +xfs_log_worker( + struct work_struct *work) +{ + struct xlog *log = container_of(to_delayed_work(work), + struct xlog, l_work); + struct xfs_mount *mp = log->l_mp; + + /* dgc: errors ignored - not fatal and nowhere to report them */ + if (xfs_log_need_covered(mp)) + xfs_fs_log_dummy(mp); + else + xfs_log_force(mp, 0); + + /* start pushing all the metadata that is currently dirty */ + xfs_ail_push_all(mp->m_ail); + + /* queue us up again */ + xfs_log_work_queue(mp); +} + /* * This routine initializes some of the log structure for a given mount point. * Its primary purpose is to fill in enough, so recovery can occur. However, @@ -1195,6 +1234,7 @@ xlog_alloc_log( log->l_logBBsize = num_bblks; log->l_covered_state = XLOG_STATE_COVER_IDLE; log->l_flags |= XLOG_ACTIVE_RECOVERY; + INIT_DELAYED_WORK(&log->l_work, xfs_log_worker); log->l_prev_block = -1; /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ @@ -3700,3 +3740,4 @@ xlog_iclogs_empty( } while (iclog != log->l_iclog); return 1; } + diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 748d312850e2..26ed7de352d7 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -181,5 +181,8 @@ int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, xfs_lsn_t *commit_lsn, int flags); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); +void xfs_log_work_queue(struct xfs_mount *mp); +void xfs_log_worker(struct work_struct *work); + #endif #endif /* __XFS_LOG_H__ */ diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 18a801d76a42..9a4e0e5ec322 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -495,6 +495,7 @@ struct xlog { struct xfs_buf *l_xbuf; /* extra buffer for log * wrapping */ struct xfs_buftarg *l_targ; /* buftarg of log */ + struct delayed_work l_work; /* background flush work */ uint l_flags; uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ struct list_head *l_buf_cancel_table; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index deee09e534dc..26e46aeaa3f1 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -197,7 +197,6 @@ typedef struct xfs_mount { struct mutex m_icsb_mutex; /* balancer sync lock */ #endif struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ - struct delayed_work m_sync_work; /* background sync work */ struct delayed_work m_reclaim_work; /* background inode reclaim */ struct work_struct m_flush_work; /* background inode flush */ __int64_t m_update_flags; /* sb flags we need to update diff --git a/fs/xfs/xfs_super.c 
b/fs/xfs/xfs_super.c index 20fa955d80d1..37c39a155a58 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1005,7 +1005,6 @@ xfs_fs_put_super( { struct xfs_mount *mp = XFS_M(sb); - cancel_delayed_work_sync(&mp->m_sync_work); cancel_work_sync(&mp->m_flush_work); xfs_filestream_unmount(mp); @@ -1040,10 +1039,10 @@ xfs_fs_sync_fs( if (laptop_mode) { /* * The disk must be active because we're syncing. - * We schedule xfssyncd now (now that the disk is + * We schedule log work now (now that the disk is * active) instead of later (when it might not be). */ - flush_delayed_work(&mp->m_sync_work); + flush_delayed_work(&mp->m_log->l_work); } return 0; @@ -1200,7 +1199,7 @@ xfs_fs_remount( * value if it is non-zero, otherwise go with the default. */ xfs_restore_resvblks(mp); - xfs_syncd_queue_sync(mp); + xfs_log_work_queue(mp); } /* rw -> ro */ @@ -1246,7 +1245,7 @@ xfs_fs_unfreeze( struct xfs_mount *mp = XFS_M(sb); xfs_restore_resvblks(mp); - xfs_syncd_queue_sync(mp); + xfs_log_work_queue(mp); return 0; } @@ -1326,7 +1325,6 @@ xfs_fs_fill_super( mutex_init(&mp->m_growlock); atomic_set(&mp->m_active_trans, 0); INIT_WORK(&mp->m_flush_work, xfs_flush_worker); - INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); mp->m_super = sb; @@ -1410,12 +1408,6 @@ xfs_fs_fill_super( goto out_unmount; } - /* - * The filesystem is successfully mounted, so we can start background - * sync work now. - */ - xfs_syncd_queue_sync(mp); - return 0; out_filestream_unmount: diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index e898d1807044..2174555aebb2 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -19,6 +19,7 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" +#include "xfs_log_priv.h" #include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" @@ -344,8 +345,8 @@ xfs_quiesce_attr( /* flush all pending changes from the AIL */ xfs_ail_push_all_sync(mp->m_ail); - /* stop background sync work */ - cancel_delayed_work_sync(&mp->m_sync_work); + /* stop background log work */ + cancel_delayed_work_sync(&mp->m_log->l_work); /* * Just warn here till VFS can correctly support @@ -376,40 +377,6 @@ xfs_quiesce_attr( xfs_buf_unlock(mp->m_sb_bp); } -void -xfs_syncd_queue_sync( - struct xfs_mount *mp) -{ - queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work, - msecs_to_jiffies(xfs_syncd_centisecs * 10)); -} - -/* - * Every sync period we need to push dirty metadata and try to cover the log - * to indicate the filesystem is idle and not frozen. - */ -void -xfs_sync_worker( - struct work_struct *work) -{ - struct xfs_mount *mp = container_of(to_delayed_work(work), - struct xfs_mount, m_sync_work); - int error; - - /* dgc: errors ignored here */ - if (mp->m_super->s_writers.frozen == SB_UNFROZEN && - xfs_log_need_covered(mp)) - error = xfs_fs_log_dummy(mp); - else - xfs_log_force(mp, 0); - - /* start pushing all the metadata that is currently dirty */ - xfs_ail_push_all(mp->m_ail); - - /* queue us up again */ - xfs_syncd_queue_sync(mp); -} - /* * Queue a new inode reclaim pass if there are reclaimable inodes and there * isn't a reclaim pass already in progress. 
By default it runs every 5s based diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h index 3f59e5bed66b..8d58fab72a10 100644 --- a/fs/xfs/xfs_sync.h +++ b/fs/xfs/xfs_sync.h @@ -26,8 +26,6 @@ struct xfs_perag; extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ -void xfs_syncd_queue_sync(struct xfs_mount *mp); -void xfs_sync_worker(struct work_struct *work); void xfs_flush_worker(struct work_struct *work); void xfs_reclaim_worker(struct work_struct *work); -- cgit v1.2.1 From cf2931db2d189ce0583be7ae880d7e3f8c15f623 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:03 +1100 Subject: xfs: Bring some sanity to log unmounting When unmounting the filesystem, there are lots of operations that need to be done in a specific order, and they are spread across a couple of functions. We have to drain the AIL before we write the unmount record, and we have to shut down the background log work before we do either of them. But this is all split haphazardly across xfs_unmountfs() and xfs_log_unmount(). Move all the AIL flushing and log manipulations to xfs_log_unmount() so that the responsibilities of each function are clear and the operations they perform obvious. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 29 ++++++++++++++++++++++++++--- fs/xfs/xfs_mount.c | 24 ------------------------ 2 files changed, 26 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index efea12bfbd6b..e788f39721e3 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -855,15 +855,38 @@ xfs_log_unmount_write(xfs_mount_t *mp) } /* xfs_log_unmount_write */ /* - * Deallocate log structures for unmount/relocation. + * Shut down and release the AIL and Log. * - * We need to stop the aild from running before we destroy - * and deallocate the log as the aild references the log. + * During unmount, we need to ensure we flush all the dirty metadata objects + * from the AIL so that the log is empty before we write the unmount record to + * the log. + * + * To do this, we first need to shut down the background log work so it is not + * trying to cover the log as we clean up. We then need to unpin all objects in + * the log so we can then flush them out. Once they have completed their IO and + * run the callbacks removing themselves from the AIL, we can write the unmount + * record, tear down the AIL and finally free the log. */ void xfs_log_unmount(xfs_mount_t *mp) { cancel_delayed_work_sync(&mp->m_log->l_work); + xfs_log_force(mp, XFS_LOG_SYNC); + + /* + * The superblock buffer is uncached and while xfs_ail_push_all_sync() + * will push it, xfs_wait_buftarg() will not wait for it. Further, + * xfs_buf_iowait() cannot be used because it was pushed with the + * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for + * the IO to complete. + */ + xfs_ail_push_all_sync(mp->m_ail); + xfs_wait_buftarg(mp->m_ddev_targp); + xfs_buf_lock(mp->m_sb_bp); + xfs_buf_unlock(mp->m_sb_bp); + + xfs_log_unmount_write(mp); + xfs_trans_ail_destroy(mp); xlog_dealloc_log(mp->m_log); } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index d9a31c6a0c53..c195ec85c725 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1459,13 +1459,6 @@ xfs_unmountfs( xfs_qm_unmount(mp); - /* - * Flush out the log synchronously so that we know for sure - * that nothing is pinned. This is important because bflush() - * will skip pinned buffers.
- */ - xfs_log_force(mp, XFS_LOG_SYNC); - /* * Unreserve any blocks we have so that when we unmount we don't account * the reserved free space as used. This is really only necessary for @@ -1491,23 +1484,6 @@ xfs_warn(mp, "Unable to update superblock counters. " "Freespace may not be correct on next mount."); - /* - * At this point we might have modified the superblock again and thus - * added an item to the AIL, thus flush it again. - */ - xfs_ail_push_all_sync(mp->m_ail); - xfs_wait_buftarg(mp->m_ddev_targp); - - /* - * The superblock buffer is uncached and xfsaild_push() will lock and - * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait() - * here but a lock on the superblock buffer will block until iodone() - * has completed. - */ - xfs_buf_lock(mp->m_sb_bp); - xfs_buf_unlock(mp->m_sb_bp); - - xfs_log_unmount_write(mp); xfs_log_unmount(mp); xfs_uuid_unmount(mp); -- cgit v1.2.1 From 9aa05000f2b7cab4be582afba64af10b2d74727e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:04 +1100 Subject: xfs: xfs_sync_data is redundant. We don't do any data writeback from XFS any more - the VFS is completely responsible for that, including for freeze. We can replace the remaining caller with a VFS level function that achieves the same thing, but without conflicting with current writeback work. This means we can remove the flush_work and xfs_flush_inodes() - the VFS functionality completely replaces the internal flush queue for doing this writeback work in a separate context to avoid stack overruns. This does have one complication - it cannot be called with page locks held. Hence move the flushing of delalloc space when ENOSPC occurs back up into xfs_file_aio_buffered_write when we don't hold any locks that will stall writeback. Unfortunately, writeback_inodes_sb_if_idle() is not sufficient to trigger delalloc conversion fast enough to prevent spurious ENOSPC when there are hundreds of writers, thousands of small files and GBs of free RAM. Hence we need to use sync_inodes_sb() to block callers while we wait for writeback like the previous xfs_flush_inodes implementation did. That means we have to hold the s_umount lock here, but because this call can nest inside i_mutex (the parent directory in the create case, held by the VFS), we have to use down_read_trylock() to avoid potential deadlocks. In practice, this trylock will succeed on almost every attempt as unmount/remount type operations are exceedingly rare. Note: we always need to pass a count of zero to generic_file_buffered_write() as the previously written byte count. We only do this by accident before this patch by the virtue of ret always being zero when there are no errors. Make this explicit rather than needing to specifically zero ret in the ENOSPC retry case.
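The replacement boils down to a trylock-guarded call into VFS writeback; a condensed view of the hunk below, with the locking rationale spelled out:

	void
	xfs_flush_inodes(
		struct xfs_mount	*mp)
	{
		struct super_block	*sb = mp->m_super;

		/* s_umount can nest inside i_mutex here, hence the trylock */
		if (down_read_trylock(&sb->s_umount)) {
			sync_inodes_sb(sb);	/* blocks until writeback IO completes */
			up_read(&sb->s_umount);
		}
	}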
Signed-off-by: Dave Chinner Tested-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_file.c | 13 +++++---- fs/xfs/xfs_iomap.c | 23 +++++---------- fs/xfs/xfs_mount.h | 1 - fs/xfs/xfs_super.c | 21 ++++++++++++-- fs/xfs/xfs_super.h | 1 + fs/xfs/xfs_sync.c | 78 --------------------------------------------------- fs/xfs/xfs_sync.h | 3 -- fs/xfs/xfs_vnodeops.c | 2 +- 8 files changed, 34 insertions(+), 108 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index aa473fa640a2..daf4066c24b2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -728,16 +728,17 @@ xfs_file_buffered_aio_write( write_retry: trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); ret = generic_file_buffered_write(iocb, iovp, nr_segs, - pos, &iocb->ki_pos, count, ret); + pos, &iocb->ki_pos, count, 0); + /* - * if we just got an ENOSPC, flush the inode now we aren't holding any - * page locks and retry *once* + * If we just got an ENOSPC, try to write back all dirty inodes to + * convert delalloc space to free up some of the excess reserved + * metadata space. */ if (ret == -ENOSPC && !enospc) { enospc = 1; - ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE); - if (!ret) - goto write_retry; + xfs_flush_inodes(ip->i_mount); + goto write_retry; } current->backing_dev_info = NULL; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 973dff6ad935..f858b903678e 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -373,7 +373,7 @@ xfs_iomap_write_delay( xfs_extlen_t extsz; int nimaps; xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; - int prealloc, flushed = 0; + int prealloc; int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -434,26 +434,17 @@ retry: } /* - * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For - * ENOSPC, * flush all other inodes with delalloc blocks to free up - * some of the excess reserved metadata space. For both cases, retry + * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry * without EOF preallocation. */ if (nimaps == 0) { trace_xfs_delalloc_enospc(ip, offset, count); - if (flushed) - return XFS_ERROR(error ? error : ENOSPC); - - if (error == ENOSPC) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_flush_inodes(ip); - xfs_ilock(ip, XFS_ILOCK_EXCL); + if (prealloc) { + prealloc = 0; + error = 0; + goto retry; } - - flushed = 1; - error = 0; - prealloc = 0; - goto retry; + return XFS_ERROR(error ? error : ENOSPC); } if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 26e46aeaa3f1..a54b5aa498d4 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -198,7 +198,6 @@ typedef struct xfs_mount { #endif struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ struct delayed_work m_reclaim_work; /* background inode reclaim */ - struct work_struct m_flush_work; /* background inode flush */ __int64_t m_update_flags; /* sb flags we need to update on the next remount,rw */ struct shrinker m_inode_shrink; /* inode reclaim shrinker */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 37c39a155a58..9468c6878463 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -882,6 +882,24 @@ xfs_destroy_mount_workqueues( destroy_workqueue(mp->m_unwritten_workqueue); } +/* + * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK + * or a page lock. 
We use sync_inodes_sb() here to ensure we block while waiting + * for IO to complete so that we effectively throttle multiple callers to the + * rate at which IO is completing. + */ +void +xfs_flush_inodes( + struct xfs_mount *mp) +{ + struct super_block *sb = mp->m_super; + + if (down_read_trylock(&sb->s_umount)) { + sync_inodes_sb(sb); + up_read(&sb->s_umount); + } +} + /* Catch misguided souls that try to use this interface on XFS */ STATIC struct inode * xfs_fs_alloc_inode( @@ -1005,8 +1023,6 @@ xfs_fs_put_super( { struct xfs_mount *mp = XFS_M(sb); - cancel_work_sync(&mp->m_flush_work); - xfs_filestream_unmount(mp); xfs_unmountfs(mp); @@ -1324,7 +1340,6 @@ xfs_fs_fill_super( spin_lock_init(&mp->m_sb_lock); mutex_init(&mp->m_growlock); atomic_set(&mp->m_active_trans, 0); - INIT_WORK(&mp->m_flush_work, xfs_flush_worker); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); mp->m_super = sb; diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 9de4a920ba05..bbe3d15a7904 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -74,6 +74,7 @@ struct block_device; extern __uint64_t xfs_max_file_offset(unsigned int); +extern void xfs_flush_inodes(struct xfs_mount *mp); extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *); extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *); diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 2174555aebb2..6a2ada379166 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -216,51 +216,6 @@ xfs_inode_ag_iterator( return XFS_ERROR(last_error); } -STATIC int -xfs_sync_inode_data( - struct xfs_inode *ip, - struct xfs_perag *pag, - int flags) -{ - struct inode *inode = VFS_I(ip); - struct address_space *mapping = inode->i_mapping; - int error = 0; - - if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) - return 0; - - if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) { - if (flags & SYNC_TRYLOCK) - return 0; - xfs_ilock(ip, XFS_IOLOCK_SHARED); - } - - error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? - 0 : XBF_ASYNC, FI_NONE); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return error; -} - -/* - * Write out pagecache data for the whole filesystem. - */ -STATIC int -xfs_sync_data( - struct xfs_mount *mp, - int flags) -{ - int error; - - ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); - - error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags); - if (error) - return XFS_ERROR(error); - - xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0); - return 0; -} - STATIC int xfs_sync_fsdata( struct xfs_mount *mp) @@ -415,39 +370,6 @@ xfs_reclaim_worker( xfs_syncd_queue_reclaim(mp); } -/* - * Flush delayed allocate data, attempting to free up reserved space - * from existing allocations. At this point a new allocation attempt - * has failed with ENOSPC and we are in the process of scratching our - * heads, looking about for more room. - * - * Queue a new data flush if there isn't one already in progress and - * wait for completion of the flush. This means that we only ever have one - * inode flush in progress no matter how many ENOSPC events are occurring and - * so will prevent the system from bogging down due to every concurrent - * ENOSPC event scanning all the active inodes in the system for writeback. 
- */ -void -xfs_flush_inodes( - struct xfs_inode *ip) -{ - struct xfs_mount *mp = ip->i_mount; - - queue_work(xfs_syncd_wq, &mp->m_flush_work); - flush_work(&mp->m_flush_work); -} - -void -xfs_flush_worker( - struct work_struct *work) -{ - struct xfs_mount *mp = container_of(work, - struct xfs_mount, m_flush_work); - - xfs_sync_data(mp, SYNC_TRYLOCK); - xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT); -} - void __xfs_inode_set_reclaim_tag( struct xfs_perag *pag, diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h index 8d58fab72a10..0018e846f0dc 100644 --- a/fs/xfs/xfs_sync.h +++ b/fs/xfs/xfs_sync.h @@ -26,14 +26,11 @@ struct xfs_perag; extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ -void xfs_flush_worker(struct work_struct *work); void xfs_reclaim_worker(struct work_struct *work); int xfs_quiesce_data(struct xfs_mount *mp); void xfs_quiesce_attr(struct xfs_mount *mp); -void xfs_flush_inodes(struct xfs_inode *ip); - int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); int xfs_reclaim_inodes_count(struct xfs_mount *mp); void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 2a5c637344b4..14928564f106 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -777,7 +777,7 @@ xfs_create( XFS_TRANS_PERM_LOG_RES, log_count); if (error == ENOSPC) { /* flush outstanding delalloc blocks and retry */ - xfs_flush_inodes(dp); + xfs_flush_inodes(mp); error = xfs_trans_reserve(tp, resblks, log_res, 0, XFS_TRANS_PERM_LOG_RES, log_count); } -- cgit v1.2.1 From 5889608df35783590251cfd440fa5d48f1855179 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:05 +1100 Subject: xfs: syncd workqueue is no more With the syncd functions moved to the log and/or removed, the syncd workqueue is the only remaining bit left. It is used by the log covering/ail pushing work, as well as by the inode reclaim work. Given how cheap workqueues are these days, give the log and inode reclaim work their own work queues and kill the syncd work queue. 
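Condensed from the hunks below, the two new per-mount queues replace the global xfs_syncd_wq (error unwinding omitted here; see the full diff):

	/* allocation, one queue per background function: */
	mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
						  WQ_NON_REENTRANT, 0, mp->m_fsname);
	mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
					      WQ_NON_REENTRANT, 0, mp->m_fsname);

	/* and each work item is then queued against its own queue: */
	queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work,
			   msecs_to_jiffies(xfs_syncd_centisecs * 10));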
Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 2 +- fs/xfs/xfs_mount.h | 2 ++ fs/xfs/xfs_super.c | 38 ++++++++++++++++++-------------------- fs/xfs/xfs_sync.c | 20 +++++++++----------- fs/xfs/xfs_sync.h | 2 -- 5 files changed, 30 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index e788f39721e3..b6ce4d4b6def 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1193,7 +1193,7 @@ void xfs_log_work_queue( struct xfs_mount *mp) { - queue_delayed_work(xfs_syncd_wq, &mp->m_log->l_work, + queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work, msecs_to_jiffies(xfs_syncd_centisecs * 10)); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index a54b5aa498d4..7c417b6b99ee 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -207,6 +207,8 @@ typedef struct xfs_mount { struct workqueue_struct *m_data_workqueue; struct workqueue_struct *m_unwritten_workqueue; struct workqueue_struct *m_cil_workqueue; + struct workqueue_struct *m_reclaim_workqueue; + struct workqueue_struct *m_log_workqueue; } xfs_mount_t; /* diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 9468c6878463..27d5a92e1210 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -863,8 +863,23 @@ xfs_init_mount_workqueues( WQ_MEM_RECLAIM, 0, mp->m_fsname); if (!mp->m_cil_workqueue) goto out_destroy_unwritten; + + mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", + WQ_NON_REENTRANT, 0, mp->m_fsname); + if (!mp->m_reclaim_workqueue) + goto out_destroy_cil; + + mp->m_log_workqueue = alloc_workqueue("xfs-log/%s", + WQ_NON_REENTRANT, 0, mp->m_fsname); + if (!mp->m_log_workqueue) + goto out_destroy_reclaim; + return 0; +out_destroy_reclaim: + destroy_workqueue(mp->m_reclaim_workqueue); +out_destroy_cil: + destroy_workqueue(mp->m_cil_workqueue); out_destroy_unwritten: destroy_workqueue(mp->m_unwritten_workqueue); out_destroy_data_iodone_queue: @@ -877,6 +892,8 @@ STATIC void xfs_destroy_mount_workqueues( struct xfs_mount *mp) { + destroy_workqueue(mp->m_log_workqueue); + destroy_workqueue(mp->m_reclaim_workqueue); destroy_workqueue(mp->m_cil_workqueue); destroy_workqueue(mp->m_data_workqueue); destroy_workqueue(mp->m_unwritten_workqueue); @@ -1391,10 +1408,6 @@ xfs_fs_fill_super( /* * we must configure the block size in the superblock before we run the * full mount process as the mount process can lookup and cache inodes. - * For the same reason we must also initialise the syncd and register - * the inode cache shrinker so that inodes can be reclaimed during - * operations like a quotacheck that iterate all inodes in the - * filesystem. */ sb->s_magic = XFS_SB_MAGIC; sb->s_blocksize = mp->m_sb.sb_blocksize; @@ -1638,16 +1651,6 @@ xfs_destroy_zones(void) STATIC int __init xfs_init_workqueues(void) { - /* - * We never want to the same work item to run twice, reclaiming inodes - * or idling the log is not going to get any faster by multiple CPUs - * competing for ressources. Use the default large max_active value - * so that even lots of filesystems can perform these task in parallel. 
- */ - xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0); - if (!xfs_syncd_wq) - return -ENOMEM; - /* * The allocation workqueue can be used in memory reclaim situations * (writepage path), and parallelism is only limited by the number of @@ -1656,20 +1659,15 @@ xfs_init_workqueues(void) */ xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0); if (!xfs_alloc_wq) - goto out_destroy_syncd; + return -ENOMEM; return 0; - -out_destroy_syncd: - destroy_workqueue(xfs_syncd_wq); - return -ENOMEM; } STATIC void xfs_destroy_workqueues(void) { destroy_workqueue(xfs_alloc_wq); - destroy_workqueue(xfs_syncd_wq); } STATIC int __init diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 6a2ada379166..15be21f074fd 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -40,8 +40,6 @@ #include #include -struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ - /* * The inode lookup is done in batches to keep the amount of lock traffic and * radix tree lookups to a minimum. The batch size is a trade off between @@ -335,18 +333,18 @@ xfs_quiesce_attr( /* * Queue a new inode reclaim pass if there are reclaimable inodes and there * isn't a reclaim pass already in progress. By default it runs every 5s based - * on the xfs syncd work default of 30s. Perhaps this should have it's own + * on the xfs periodic sync default of 30s. Perhaps this should have it's own * tunable, but that can be done if this method proves to be ineffective or too * aggressive. */ static void -xfs_syncd_queue_reclaim( +xfs_reclaim_work_queue( struct xfs_mount *mp) { rcu_read_lock(); if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { - queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work, + queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); } rcu_read_unlock(); @@ -367,7 +365,7 @@ xfs_reclaim_worker( struct xfs_mount, m_reclaim_work); xfs_reclaim_inodes(mp, SYNC_TRYLOCK); - xfs_syncd_queue_reclaim(mp); + xfs_reclaim_work_queue(mp); } void @@ -388,7 +386,7 @@ __xfs_inode_set_reclaim_tag( spin_unlock(&ip->i_mount->m_perag_lock); /* schedule periodic background inode reclaim */ - xfs_syncd_queue_reclaim(ip->i_mount); + xfs_reclaim_work_queue(ip->i_mount); trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, -1, _RET_IP_); @@ -646,9 +644,9 @@ out: /* * We could return EAGAIN here to make reclaim rescan the inode tree in * a short while. However, this just burns CPU time scanning the tree - * waiting for IO to complete and xfssyncd never goes back to the idle - * state. Instead, return 0 to let the next scheduled background reclaim - * attempt to reclaim the inode again. + * waiting for IO to complete and the reclaim work never goes back to + * the idle state. Instead, return 0 to let the next scheduled + * background reclaim attempt to reclaim the inode again. 
*/ return 0; } @@ -804,7 +802,7 @@ xfs_reclaim_inodes_nr( int nr_to_scan) { /* kick background reclaimer and push the AIL */ - xfs_syncd_queue_reclaim(mp); + xfs_reclaim_work_queue(mp); xfs_ail_push_all(mp->m_ail); xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h index 0018e846f0dc..0beabea99e73 100644 --- a/fs/xfs/xfs_sync.h +++ b/fs/xfs/xfs_sync.h @@ -24,8 +24,6 @@ struct xfs_perag; #define SYNC_WAIT 0x0001 /* wait for i/o to complete */ #define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ -extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ - void xfs_reclaim_worker(struct work_struct *work); int xfs_quiesce_data(struct xfs_mount *mp); -- cgit v1.2.1 From 34061f5c420561dd42addd252811a1fa4b0ac69b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:06 +1100 Subject: xfs: xfs_sync_fsdata is redundant Why do we need to write the superblock to disk once we've written all the data? We don't actually - the reasons for doing this are lost in the mists of time, and go back to the way Irix used to drive VFS flushing. On linux, this code is only called from two contexts: remount and .sync_fs. In the remount case, the call is followed by a metadata sync, which unpins and writes the superblock. In the sync_fs case, we only need to force the log to disk to ensure that the superblock is correctly on disk, so we don't actually need to write it. Hence the functionality is either redundant or superfluous and thus can be removed. Seeing as xfs_quiesce_data is essentially now just a log force, remove it as well and fold the code back into the two callers. Neither of them need the log covering check, either, as that is redundant for the remount case, and unnecessary for the .sync_fs case. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_super.c | 19 +++++----------- fs/xfs/xfs_sync.c | 67 +++++++----------------------------------------------- 2 files changed, 14 insertions(+), 72 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 27d5a92e1210..b5e445a13f7b 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1057,7 +1057,6 @@ xfs_fs_sync_fs( int wait) { struct xfs_mount *mp = XFS_M(sb); - int error; /* * Doing anything during the async pass would be counterproductive. @@ -1065,10 +1064,7 @@ xfs_fs_sync_fs( if (!wait) return 0; - error = xfs_quiesce_data(mp); - if (error) - return -error; - + xfs_log_force(mp, XFS_LOG_SYNC); if (laptop_mode) { /* * The disk must be active because we're syncing. @@ -1238,15 +1234,12 @@ xfs_fs_remount( /* rw -> ro */ if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { /* - * After we have synced the data but before we sync the - * metadata, we need to free up the reserve block pool so that - * the used block count in the superblock on disk is correct at - * the end of the remount. Stash the current reserve pool size - * so that if we get remounted rw, we can return it to the same - * size. + * Before we sync the metadata, we need to free up the reserve + * block pool so that the used block count in the superblock on + * disk is correct at the end of the remount. Stash the current + * reserve pool size so that if we get remounted rw, we can + * return it to the same size. 
*/ - - xfs_quiesce_data(mp); xfs_save_resvblks(mp); xfs_quiesce_attr(mp); mp->m_flags |= XFS_MOUNT_RDONLY; diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 15be21f074fd..581eb59a85b5 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -214,70 +214,16 @@ xfs_inode_ag_iterator( return XFS_ERROR(last_error); } -STATIC int -xfs_sync_fsdata( - struct xfs_mount *mp) -{ - struct xfs_buf *bp; - int error; - - /* - * If the buffer is pinned then push on the log so we won't get stuck - * waiting in the write for someone, maybe ourselves, to flush the log. - * - * Even though we just pushed the log above, we did not have the - * superblock buffer locked at that point so it can become pinned in - * between there and here. - */ - bp = xfs_getsb(mp, 0); - if (xfs_buf_ispinned(bp)) - xfs_log_force(mp, 0); - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - return error; -} - -/* - * When remounting a filesystem read-only or freezing the filesystem, we have - * two phases to execute. This first phase is syncing the data before we - * quiesce the filesystem, and the second is flushing all the inodes out after - * we've waited for all the transactions created by the first phase to - * complete. The second phase ensures that the inodes are written to their - * location on disk rather than just existing in transactions in the log. This - * means after a quiesce there is no log replay required to write the inodes to - * disk (this is the main difference between a sync and a quiesce). - */ -/* - * First stage of freeze - no writers will make progress now we are here, - * so we flush delwri and delalloc buffers here, then wait for all I/O to - * complete. Data is frozen at that point. Metadata is not frozen, - * transactions can still occur here so don't bother emptying the AIL - * because it'll just get dirty again. - */ -int -xfs_quiesce_data( - struct xfs_mount *mp) -{ - int error, error2 = 0; - - /* force out the log */ - xfs_log_force(mp, XFS_LOG_SYNC); - - /* write superblock and hoover up shutdown errors */ - error = xfs_sync_fsdata(mp); - - /* mark the log as covered if needed */ - if (xfs_log_need_covered(mp)) - error2 = xfs_fs_log_dummy(mp); - - return error ? error : error2; -} - /* * Second stage of a quiesce. The data is already synced, now we have to take * care of the metadata. New transactions are already blocked, so we need to * wait for any remaining transactions to drain out before proceeding. * + * The second phase ensures that the inodes are written to their + * location on disk rather than just existing in transactions in the log. This + * means after a quiesce there is no log replay required to write the inodes to + * disk (this is the main difference between a sync and a quiesce). + * * Note: this stops background sync work - the callers must ensure it is started * again when appropriate. */ @@ -291,6 +237,9 @@ xfs_quiesce_attr( while (atomic_read(&mp->m_active_trans) > 0) delay(100); + /* force the log to unpin objects from the now complete transactions */ + xfs_log_force(mp, XFS_LOG_SYNC); + /* reclaim inodes to do any IO before the freeze completes */ xfs_reclaim_inodes(mp, 0); xfs_reclaim_inodes(mp, SYNC_WAIT); -- cgit v1.2.1 From c7eea6f7adca4501d2c2db7f0f7c9dc88efac95e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:07 +1100 Subject: xfs: move xfs_quiesce_attr() into xfs_super.c Both callers of xfs_quiesce_attr() are in xfs_super.c, and there's nothing really sync-specific about this functionality so it doesn't really matter where it lives. 
Move it to be next to its callers, so all the remount/sync_fs code is in the one place. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_super.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_sync.c | 65 ---------------------------------------------------- fs/xfs/xfs_sync.h | 3 --- 3 files changed, 67 insertions(+), 68 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index b5e445a13f7b..3bafe66227fb 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1148,6 +1148,73 @@ xfs_restore_resvblks(struct xfs_mount *mp) xfs_reserve_blocks(mp, &resblks, NULL); } +/* + * Trigger writeback of all the dirty metadata in the file system. + * + * This ensures that the metadata is written to their location on disk rather + * than just existing in transactions in the log. This means after a quiesce + * there is no log replay required to write the inodes to disk (this is the main + * difference between a sync and a quiesce). + * + * This shoul deffectively mimic the code in xfs_unmountfs() and + * xfs_log_umount() but without tearing down any structures. + * XXX: bug fixes needed! + * + * Note: this stops background log work - the callers must ensure it is started + * again when appropriate. + */ +void +xfs_quiesce_attr( + struct xfs_mount *mp) +{ + int error = 0; + + /* wait for all modifications to complete */ + while (atomic_read(&mp->m_active_trans) > 0) + delay(100); + + /* force the log to unpin objects from the now complete transactions */ + xfs_log_force(mp, XFS_LOG_SYNC); + + /* reclaim inodes to do any IO before the freeze completes */ + xfs_reclaim_inodes(mp, 0); + xfs_reclaim_inodes(mp, SYNC_WAIT); + + /* flush all pending changes from the AIL */ + xfs_ail_push_all_sync(mp->m_ail); + + /* stop background log work */ + cancel_delayed_work_sync(&mp->m_log->l_work); + + /* + * Just warn here till VFS can correctly support + * read-only remount without racing. + */ + WARN_ON(atomic_read(&mp->m_active_trans) != 0); + + /* Push the superblock and write an unmount record */ + error = xfs_log_sbcount(mp); + if (error) + xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " + "Frozen image may not be consistent."); + xfs_log_unmount_write(mp); + + /* + * At this point we might have modified the superblock again and thus + * added an item to the AIL, thus flush it again. + */ + xfs_ail_push_all_sync(mp->m_ail); + + /* + * The superblock buffer is uncached and xfsaild_push() will lock and + * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait() + * here but a lock on the superblock buffer will block until iodone() + * has completed. + */ + xfs_buf_lock(mp->m_sb_bp); + xfs_buf_unlock(mp->m_sb_bp); +} + STATIC int xfs_fs_remount( struct super_block *sb, diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 581eb59a85b5..7b630288bab5 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -214,71 +214,6 @@ xfs_inode_ag_iterator( return XFS_ERROR(last_error); } -/* - * Second stage of a quiesce. The data is already synced, now we have to take - * care of the metadata. New transactions are already blocked, so we need to - * wait for any remaining transactions to drain out before proceeding. - * - * The second phase ensures that the inodes are written to their - * location on disk rather than just existing in transactions in the log.
- * means after a quiesce there is no log replay required to write the inodes to
- * disk (this is the main difference between a sync and a quiesce).
- *
- * Note: this stops background sync work - the callers must ensure it is started
- * again when appropriate.
- */
-void
-xfs_quiesce_attr(
-	struct xfs_mount	*mp)
-{
-	int	error = 0;
-
-	/* wait for all modifications to complete */
-	while (atomic_read(&mp->m_active_trans) > 0)
-		delay(100);
-
-	/* force the log to unpin objects from the now complete transactions */
-	xfs_log_force(mp, XFS_LOG_SYNC);
-
-	/* reclaim inodes to do any IO before the freeze completes */
-	xfs_reclaim_inodes(mp, 0);
-	xfs_reclaim_inodes(mp, SYNC_WAIT);
-
-	/* flush all pending changes from the AIL */
-	xfs_ail_push_all_sync(mp->m_ail);
-
-	/* stop background log work */
-	cancel_delayed_work_sync(&mp->m_log->l_work);
-
-	/*
-	 * Just warn here till VFS can correctly support
-	 * read-only remount without racing.
-	 */
-	WARN_ON(atomic_read(&mp->m_active_trans) != 0);
-
-	/* Push the superblock and write an unmount record */
-	error = xfs_log_sbcount(mp);
-	if (error)
-		xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
-				"Frozen image may not be consistent.");
-	xfs_log_unmount_write(mp);
-
-	/*
-	 * At this point we might have modified the superblock again and thus
-	 * added an item to the AIL, thus flush it again.
-	 */
-	xfs_ail_push_all_sync(mp->m_ail);
-
-	/*
-	 * The superblock buffer is uncached and xfsaild_push() will lock and
-	 * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait()
-	 * here but a lock on the superblock buffer will block until iodone()
-	 * has completed.
-	 */
-	xfs_buf_lock(mp->m_sb_bp);
-	xfs_buf_unlock(mp->m_sb_bp);
-}
-
 /*
  * Queue a new inode reclaim pass if there are reclaimable inodes and there
  * isn't a reclaim pass already in progress. By default it runs every 5s based
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
index 0beabea99e73..0ba9c89c316e 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_sync.h
@@ -26,9 +26,6 @@ struct xfs_perag;

 void xfs_reclaim_worker(struct work_struct *work);

-int xfs_quiesce_data(struct xfs_mount *mp);
-void xfs_quiesce_attr(struct xfs_mount *mp);
-
 int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
 int xfs_reclaim_inodes_count(struct xfs_mount *mp);
 void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
--
cgit v1.2.1

From c75921a72a7c4bb73a5e09a697a672722e5543f1 Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Mon, 8 Oct 2012 21:56:08 +1100
Subject: xfs: xfs_quiesce_attr() should quiesce the log like unmount

xfs_quiesce_attr() is supposed to leave the log empty with an unmount
record written. Right now it does not wait for the AIL to be emptied before
writing the unmount record, nor does it wait for metadata IO completion,
either. Fix it to use the same method and code as xfs_log_unmount().

Signed-off-by: Dave Chinner
Reviewed-by: Christoph Hellwig
Reviewed-by: Mark Tinguely
Signed-off-by: Ben Myers
---
 fs/xfs/xfs_log.c   | 25 ++++++++++++++++++-------
 fs/xfs/xfs_log.h   |  1 +
 fs/xfs/xfs_super.c | 41 ++++++++---------------------------------
 3 files changed, 27 insertions(+), 40 deletions(-)
(limited to 'fs')

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index b6ce4d4b6def..d2d59692739f 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -855,20 +855,17 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 }	/* xfs_log_unmount_write */

 /*
- * Shut down and release the AIL and Log.
- *
- * During unmount, we need to ensure we flush all the dirty metadata objects
- * from the AIL so that the log is empty before we write the unmount record to
- * the log.
+ * Empty the log for unmount/freeze.
  *
  * To do this, we first need to shut down the background log work so it is not
  * trying to cover the log as we clean up. We then need to unpin all objects in
  * the log so we can then flush them out. Once they have completed their IO and
  * run the callbacks removing themselves from the AIL, we can write the unmount
- * record, tear down the AIL and finally free the log.
+ * record.
  */
 void
-xfs_log_unmount(xfs_mount_t *mp)
+xfs_log_quiesce(
+	struct xfs_mount	*mp)
 {
 	cancel_delayed_work_sync(&mp->m_log->l_work);
 	xfs_log_force(mp, XFS_LOG_SYNC);
@@ -886,6 +883,20 @@ xfs_log_unmount(xfs_mount_t *mp)
 	xfs_buf_unlock(mp->m_sb_bp);

 	xfs_log_unmount_write(mp);
+}
+
+/*
+ * Shut down and release the AIL and Log.
+ *
+ * During unmount, we need to ensure we flush all the dirty metadata objects
+ * from the AIL so that the log is empty before we write the unmount record to
+ * the log. Once this is done, we can tear down the AIL and the log.
+ */
+void
+xfs_log_unmount(
+	struct xfs_mount	*mp)
+{
+	xfs_log_quiesce(mp);

 	xfs_trans_ail_destroy(mp);
 	xlog_dealloc_log(mp->m_log);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 26ed7de352d7..5caee96059df 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -183,6 +183,7 @@ bool	xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);

 void	xfs_log_work_queue(struct xfs_mount *mp);
 void	xfs_log_worker(struct work_struct *work);
+void	xfs_log_quiesce(struct xfs_mount *mp);

 #endif
 #endif	/* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 3bafe66227fb..fdedf2cabae3 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1153,15 +1153,11 @@ xfs_restore_resvblks(struct xfs_mount *mp)
  *
  * This ensures that the metadata is written to their location on disk rather
  * than just existing in transactions in the log. This means after a quiesce
- * there is no log replay required to write the inodes to disk (this is the main
- * difference between a sync and a quiesce).
+ * there is no log replay required to write the inodes to disk - this is the
+ * primary difference between a sync and a quiesce.
  *
- * This shoul deffectively mimic the code in xfs_unmountfs() and
- * xfs_log_umount() but without tearing down any structures.
- * XXX: bug fixes needed!
- *
- * Note: this stops background log work - the callers must ensure it is started
- * again when appropriate.
+ * Note: xfs_log_quiesce() stops background log work - the callers must ensure
+ * it is started again when appropriate.
  */
 void
 xfs_quiesce_attr(
@@ -1180,39 +1176,18 @@ xfs_quiesce_attr(
 	xfs_reclaim_inodes(mp, 0);
 	xfs_reclaim_inodes(mp, SYNC_WAIT);

-	/* flush all pending changes from the AIL */
-	xfs_ail_push_all_sync(mp->m_ail);
-
-	/* stop background log work */
-	cancel_delayed_work_sync(&mp->m_log->l_work);
-
-	/*
-	 * Just warn here till VFS can correctly support
-	 * read-only remount without racing.
-	 */
-	WARN_ON(atomic_read(&mp->m_active_trans) != 0);
-
 	/* Push the superblock and write an unmount record */
 	error = xfs_log_sbcount(mp);
 	if (error)
 		xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
 				"Frozen image may not be consistent.");
-	xfs_log_unmount_write(mp);

 	/*
-	 * At this point we might have modified the superblock again and thus
-	 * added an item to the AIL, thus flush it again.
+	 * Just warn here till VFS can correctly support
+	 * read-only remount without racing.
 	 */
-	xfs_ail_push_all_sync(mp->m_ail);
+	WARN_ON(atomic_read(&mp->m_active_trans) != 0);

-	/*
-	 * The superblock buffer is uncached and xfsaild_push() will lock and
-	 * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait()
-	 * here but a lock on the superblock buffer will block until iodone()
-	 * has completed.
-	 */
-	xfs_buf_lock(mp->m_sb_bp);
-	xfs_buf_unlock(mp->m_sb_bp);
+	xfs_log_quiesce(mp);
 }

 STATIC int
--
cgit v1.2.1

From 6d8b79cfca39399ef9115fb65dde85993455c9a3 Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Mon, 8 Oct 2012 21:56:09 +1100
Subject: xfs: rename xfs_sync.[ch] to xfs_icache.[ch]

xfs_sync.c now only contains inode reclaim functions and inode cache
iteration functions. It is not related to sync operations anymore.
Rename to xfs_icache.c to reflect its contents and prepare for
consolidation with the other inode cache file that exists (xfs_iget.c).

Signed-off-by: Dave Chinner
Reviewed-by: Christoph Hellwig
Reviewed-by: Mark Tinguely
Signed-off-by: Ben Myers
---
 fs/xfs/Makefile          |   2 +-
 fs/xfs/xfs_icache.c      | 715 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_icache.h      |  43 +++
 fs/xfs/xfs_iget.c        |   1 +
 fs/xfs/xfs_mount.c       |   1 +
 fs/xfs/xfs_mount.h       |   2 -
 fs/xfs/xfs_qm_syscalls.c |   1 +
 fs/xfs/xfs_super.c       |   2 +-
 fs/xfs/xfs_sync.c        | 714 ----------------------------------------------
 fs/xfs/xfs_sync.h        |  43 ---
 10 files changed, 763 insertions(+), 761 deletions(-)
 create mode 100644 fs/xfs/xfs_icache.c
 create mode 100644 fs/xfs/xfs_icache.h
 delete mode 100644 fs/xfs/xfs_sync.c
 delete mode 100644 fs/xfs/xfs_sync.h
(limited to 'fs')

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index d2bf974b1a2f..442f256dbcac 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -39,6 +39,7 @@ xfs-y += xfs_aops.o \
 	xfs_fsops.o \
 	xfs_fs_subr.o \
 	xfs_globals.o \
+	xfs_icache.o \
 	xfs_iget.o \
 	xfs_ioctl.o \
 	xfs_iomap.o \
@@ -47,7 +48,6 @@ xfs-y += xfs_aops.o \
 	xfs_message.o \
 	xfs_mru_cache.o \
 	xfs_super.o \
-	xfs_sync.o \
 	xfs_xattr.o \
 	xfs_rename.o \
 	xfs_utils.o \
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
new file mode 100644
index 000000000000..eba216f11d5e
--- /dev/null
+++ b/fs/xfs/xfs_icache.c
@@ -0,0 +1,715 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_log.h" +#include "xfs_log_priv.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_dinode.h" +#include "xfs_error.h" +#include "xfs_filestream.h" +#include "xfs_vnodeops.h" +#include "xfs_inode_item.h" +#include "xfs_quota.h" +#include "xfs_trace.h" +#include "xfs_fsops.h" +#include "xfs_icache.h" + +#include +#include + +/* + * The inode lookup is done in batches to keep the amount of lock traffic and + * radix tree lookups to a minimum. The batch size is a trade off between + * lookup reduction and stack usage. This is in the reclaim path, so we can't + * be too greedy. + */ +#define XFS_LOOKUP_BATCH 32 + +STATIC int +xfs_inode_ag_walk_grab( + struct xfs_inode *ip) +{ + struct inode *inode = VFS_I(ip); + + ASSERT(rcu_read_lock_held()); + + /* + * check for stale RCU freed inode + * + * If the inode has been reallocated, it doesn't matter if it's not in + * the AG we are walking - we are walking for writeback, so if it + * passes all the "valid inode" checks and is dirty, then we'll write + * it back anyway. If it has been reallocated and still being + * initialised, the XFS_INEW check below will catch it. + */ + spin_lock(&ip->i_flags_lock); + if (!ip->i_ino) + goto out_unlock_noent; + + /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ + if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) + goto out_unlock_noent; + spin_unlock(&ip->i_flags_lock); + + /* nothing to sync during shutdown */ + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return EFSCORRUPTED; + + /* If we can't grab the inode, it must on it's way to reclaim. */ + if (!igrab(inode)) + return ENOENT; + + if (is_bad_inode(inode)) { + IRELE(ip); + return ENOENT; + } + + /* inode is valid */ + return 0; + +out_unlock_noent: + spin_unlock(&ip->i_flags_lock); + return ENOENT; +} + +STATIC int +xfs_inode_ag_walk( + struct xfs_mount *mp, + struct xfs_perag *pag, + int (*execute)(struct xfs_inode *ip, + struct xfs_perag *pag, int flags), + int flags) +{ + uint32_t first_index; + int last_error = 0; + int skipped; + int done; + int nr_found; + +restart: + done = 0; + skipped = 0; + first_index = 0; + nr_found = 0; + do { + struct xfs_inode *batch[XFS_LOOKUP_BATCH]; + int error = 0; + int i; + + rcu_read_lock(); + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, + (void **)batch, first_index, + XFS_LOOKUP_BATCH); + if (!nr_found) { + rcu_read_unlock(); + break; + } + + /* + * Grab the inodes before we drop the lock. if we found + * nothing, nr == 0 and the loop will be skipped. + */ + for (i = 0; i < nr_found; i++) { + struct xfs_inode *ip = batch[i]; + + if (done || xfs_inode_ag_walk_grab(ip)) + batch[i] = NULL; + + /* + * Update the index for the next lookup. Catch + * overflows into the next AG range which can occur if + * we have inodes in the last block of the AG and we + * are currently pointing to the last inode. + * + * Because we may see inodes that are from the wrong AG + * due to RCU freeing and reallocation, only update the + * index if it lies in this AG. 
It was a race that lead + * us to see this inode, so another lookup from the + * same index will not find it again. + */ + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) + continue; + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) + done = 1; + } + + /* unlock now we've grabbed the inodes. */ + rcu_read_unlock(); + + for (i = 0; i < nr_found; i++) { + if (!batch[i]) + continue; + error = execute(batch[i], pag, flags); + IRELE(batch[i]); + if (error == EAGAIN) { + skipped++; + continue; + } + if (error && last_error != EFSCORRUPTED) + last_error = error; + } + + /* bail out if the filesystem is corrupted. */ + if (error == EFSCORRUPTED) + break; + + cond_resched(); + + } while (nr_found && !done); + + if (skipped) { + delay(1); + goto restart; + } + return last_error; +} + +int +xfs_inode_ag_iterator( + struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, + struct xfs_perag *pag, int flags), + int flags) +{ + struct xfs_perag *pag; + int error = 0; + int last_error = 0; + xfs_agnumber_t ag; + + ag = 0; + while ((pag = xfs_perag_get(mp, ag))) { + ag = pag->pag_agno + 1; + error = xfs_inode_ag_walk(mp, pag, execute, flags); + xfs_perag_put(pag); + if (error) { + last_error = error; + if (error == EFSCORRUPTED) + break; + } + } + return XFS_ERROR(last_error); +} + +/* + * Queue a new inode reclaim pass if there are reclaimable inodes and there + * isn't a reclaim pass already in progress. By default it runs every 5s based + * on the xfs periodic sync default of 30s. Perhaps this should have it's own + * tunable, but that can be done if this method proves to be ineffective or too + * aggressive. + */ +static void +xfs_reclaim_work_queue( + struct xfs_mount *mp) +{ + + rcu_read_lock(); + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { + queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, + msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); + } + rcu_read_unlock(); +} + +/* + * This is a fast pass over the inode cache to try to get reclaim moving on as + * many inodes as possible in a short period of time. It kicks itself every few + * seconds, as well as being kicked by the inode cache shrinker when memory + * goes low. It scans as quickly as possible avoiding locked inodes or those + * already being flushed, and once done schedules a future pass. + */ +void +xfs_reclaim_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_reclaim_work); + + xfs_reclaim_inodes(mp, SYNC_TRYLOCK); + xfs_reclaim_work_queue(mp); +} + +void +__xfs_inode_set_reclaim_tag( + struct xfs_perag *pag, + struct xfs_inode *ip) +{ + radix_tree_tag_set(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + + if (!pag->pag_ici_reclaimable) { + /* propagate the reclaim tag up into the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_set(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + + /* schedule periodic background inode reclaim */ + xfs_reclaim_work_queue(ip->i_mount); + + trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } + pag->pag_ici_reclaimable++; +} + +/* + * We set the inode flag atomically with the radix tree tag. + * Once we get tag lookups on the radix tree, this inode flag + * can go away. 
+ */ +void +xfs_inode_set_reclaim_tag( + xfs_inode_t *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + spin_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + __xfs_inode_set_reclaim_tag(pag, ip); + __xfs_iflags_set(ip, XFS_IRECLAIMABLE); + spin_unlock(&ip->i_flags_lock); + spin_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); +} + +STATIC void +__xfs_inode_clear_reclaim( + xfs_perag_t *pag, + xfs_inode_t *ip) +{ + pag->pag_ici_reclaimable--; + if (!pag->pag_ici_reclaimable) { + /* clear the reclaim tag from the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_clear(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } +} + +void +__xfs_inode_clear_reclaim_tag( + xfs_mount_t *mp, + xfs_perag_t *pag, + xfs_inode_t *ip) +{ + radix_tree_tag_clear(&pag->pag_ici_root, + XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); + __xfs_inode_clear_reclaim(pag, ip); +} + +/* + * Grab the inode for reclaim exclusively. + * Return 0 if we grabbed it, non-zero otherwise. + */ +STATIC int +xfs_reclaim_inode_grab( + struct xfs_inode *ip, + int flags) +{ + ASSERT(rcu_read_lock_held()); + + /* quick check for stale RCU freed inode */ + if (!ip->i_ino) + return 1; + + /* + * If we are asked for non-blocking operation, do unlocked checks to + * see if the inode already is being flushed or in reclaim to avoid + * lock traffic. + */ + if ((flags & SYNC_TRYLOCK) && + __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM)) + return 1; + + /* + * The radix tree lock here protects a thread in xfs_iget from racing + * with us starting reclaim on the inode. Once we have the + * XFS_IRECLAIM flag set it will not touch us. + * + * Due to RCU lookup, we may find inodes that have been freed and only + * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that + * aren't candidates for reclaim at all, so we must check the + * XFS_IRECLAIMABLE is set first before proceeding to reclaim. + */ + spin_lock(&ip->i_flags_lock); + if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) || + __xfs_iflags_test(ip, XFS_IRECLAIM)) { + /* not a reclaim candidate. */ + spin_unlock(&ip->i_flags_lock); + return 1; + } + __xfs_iflags_set(ip, XFS_IRECLAIM); + spin_unlock(&ip->i_flags_lock); + return 0; +} + +/* + * Inodes in different states need to be treated differently. The following + * table lists the inode states and the reclaim actions necessary: + * + * inode state iflush ret required action + * --------------- ---------- --------------- + * bad - reclaim + * shutdown EIO unpin and reclaim + * clean, unpinned 0 reclaim + * stale, unpinned 0 reclaim + * clean, pinned(*) 0 requeue + * stale, pinned EAGAIN requeue + * dirty, async - requeue + * dirty, sync 0 reclaim + * + * (*) dgc: I don't think the clean, pinned state is possible but it gets + * handled anyway given the order of checks implemented. + * + * Also, because we get the flush lock first, we know that any inode that has + * been flushed delwri has had the flush completed by the time we check that + * the inode is clean. + * + * Note that because the inode is flushed delayed write by AIL pushing, the + * flush lock may already be held here and waiting on it can result in very + * long latencies. 
Hence for sync reclaims, where we wait on the flush lock, + * the caller should push the AIL first before trying to reclaim inodes to + * minimise the amount of time spent waiting. For background relaim, we only + * bother to reclaim clean inodes anyway. + * + * Hence the order of actions after gaining the locks should be: + * bad => reclaim + * shutdown => unpin and reclaim + * pinned, async => requeue + * pinned, sync => unpin + * stale => reclaim + * clean => reclaim + * dirty, async => requeue + * dirty, sync => flush, wait and reclaim + */ +STATIC int +xfs_reclaim_inode( + struct xfs_inode *ip, + struct xfs_perag *pag, + int sync_mode) +{ + struct xfs_buf *bp = NULL; + int error; + +restart: + error = 0; + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (!xfs_iflock_nowait(ip)) { + if (!(sync_mode & SYNC_WAIT)) + goto out; + xfs_iflock(ip); + } + + if (is_bad_inode(VFS_I(ip))) + goto reclaim; + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_iunpin_wait(ip); + xfs_iflush_abort(ip, false); + goto reclaim; + } + if (xfs_ipincount(ip)) { + if (!(sync_mode & SYNC_WAIT)) + goto out_ifunlock; + xfs_iunpin_wait(ip); + } + if (xfs_iflags_test(ip, XFS_ISTALE)) + goto reclaim; + if (xfs_inode_clean(ip)) + goto reclaim; + + /* + * Never flush out dirty data during non-blocking reclaim, as it would + * just contend with AIL pushing trying to do the same job. + */ + if (!(sync_mode & SYNC_WAIT)) + goto out_ifunlock; + + /* + * Now we have an inode that needs flushing. + * + * Note that xfs_iflush will never block on the inode buffer lock, as + * xfs_ifree_cluster() can lock the inode buffer before it locks the + * ip->i_lock, and we are doing the exact opposite here. As a result, + * doing a blocking xfs_imap_to_bp() to get the cluster buffer would + * result in an ABBA deadlock with xfs_ifree_cluster(). + * + * As xfs_ifree_cluser() must gather all inodes that are active in the + * cache to mark them stale, if we hit this case we don't actually want + * to do IO here - we want the inode marked stale so we can simply + * reclaim it. Hence if we get an EAGAIN error here, just unlock the + * inode, back off and try again. Hopefully the next pass through will + * see the stale flag set on the inode. + */ + error = xfs_iflush(ip, &bp); + if (error == EAGAIN) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* backoff longer than in xfs_ifree_cluster */ + delay(2); + goto restart; + } + + if (!error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + } + + xfs_iflock(ip); +reclaim: + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + XFS_STATS_INC(xs_ig_reclaims); + /* + * Remove the inode from the per-AG radix tree. + * + * Because radix_tree_delete won't complain even if the item was never + * added to the tree assert that it's been there before to catch + * problems with the inode life time early on. + */ + spin_lock(&pag->pag_ici_lock); + if (!radix_tree_delete(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) + ASSERT(0); + __xfs_inode_clear_reclaim(pag, ip); + spin_unlock(&pag->pag_ici_lock); + + /* + * Here we do an (almost) spurious inode lock in order to coordinate + * with inode cache radix tree lookups. This is because the lookup + * can reference the inodes in the cache without taking references. + * + * We make that OK here by ensuring that we wait until the inode is + * unlocked after the lookup before we go ahead and free it. 
+ */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_qm_dqdetach(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + xfs_inode_free(ip); + return error; + +out_ifunlock: + xfs_ifunlock(ip); +out: + xfs_iflags_clear(ip, XFS_IRECLAIM); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* + * We could return EAGAIN here to make reclaim rescan the inode tree in + * a short while. However, this just burns CPU time scanning the tree + * waiting for IO to complete and the reclaim work never goes back to + * the idle state. Instead, return 0 to let the next scheduled + * background reclaim attempt to reclaim the inode again. + */ + return 0; +} + +/* + * Walk the AGs and reclaim the inodes in them. Even if the filesystem is + * corrupted, we still want to try to reclaim all the inodes. If we don't, + * then a shut down during filesystem unmount reclaim walk leak all the + * unreclaimed inodes. + */ +int +xfs_reclaim_inodes_ag( + struct xfs_mount *mp, + int flags, + int *nr_to_scan) +{ + struct xfs_perag *pag; + int error = 0; + int last_error = 0; + xfs_agnumber_t ag; + int trylock = flags & SYNC_TRYLOCK; + int skipped; + +restart: + ag = 0; + skipped = 0; + while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { + unsigned long first_index = 0; + int done = 0; + int nr_found = 0; + + ag = pag->pag_agno + 1; + + if (trylock) { + if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) { + skipped++; + xfs_perag_put(pag); + continue; + } + first_index = pag->pag_ici_reclaim_cursor; + } else + mutex_lock(&pag->pag_ici_reclaim_lock); + + do { + struct xfs_inode *batch[XFS_LOOKUP_BATCH]; + int i; + + rcu_read_lock(); + nr_found = radix_tree_gang_lookup_tag( + &pag->pag_ici_root, + (void **)batch, first_index, + XFS_LOOKUP_BATCH, + XFS_ICI_RECLAIM_TAG); + if (!nr_found) { + done = 1; + rcu_read_unlock(); + break; + } + + /* + * Grab the inodes before we drop the lock. if we found + * nothing, nr == 0 and the loop will be skipped. + */ + for (i = 0; i < nr_found; i++) { + struct xfs_inode *ip = batch[i]; + + if (done || xfs_reclaim_inode_grab(ip, flags)) + batch[i] = NULL; + + /* + * Update the index for the next lookup. Catch + * overflows into the next AG range which can + * occur if we have inodes in the last block of + * the AG and we are currently pointing to the + * last inode. + * + * Because we may see inodes that are from the + * wrong AG due to RCU freeing and + * reallocation, only update the index if it + * lies in this AG. It was a race that lead us + * to see this inode, so another lookup from + * the same index will not find it again. + */ + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != + pag->pag_agno) + continue; + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) + done = 1; + } + + /* unlock now we've grabbed the inodes. */ + rcu_read_unlock(); + + for (i = 0; i < nr_found; i++) { + if (!batch[i]) + continue; + error = xfs_reclaim_inode(batch[i], pag, flags); + if (error && last_error != EFSCORRUPTED) + last_error = error; + } + + *nr_to_scan -= XFS_LOOKUP_BATCH; + + cond_resched(); + + } while (nr_found && !done && *nr_to_scan > 0); + + if (trylock && !done) + pag->pag_ici_reclaim_cursor = first_index; + else + pag->pag_ici_reclaim_cursor = 0; + mutex_unlock(&pag->pag_ici_reclaim_lock); + xfs_perag_put(pag); + } + + /* + * if we skipped any AG, and we still have scan count remaining, do + * another pass this time using blocking reclaim semantics (i.e + * waiting on the reclaim locks and ignoring the reclaim cursors). 
This + * ensure that when we get more reclaimers than AGs we block rather + * than spin trying to execute reclaim. + */ + if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) { + trylock = 0; + goto restart; + } + return XFS_ERROR(last_error); +} + +int +xfs_reclaim_inodes( + xfs_mount_t *mp, + int mode) +{ + int nr_to_scan = INT_MAX; + + return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan); +} + +/* + * Scan a certain number of inodes for reclaim. + * + * When called we make sure that there is a background (fast) inode reclaim in + * progress, while we will throttle the speed of reclaim via doing synchronous + * reclaim of inodes. That means if we come across dirty inodes, we wait for + * them to be cleaned, which we hope will not be very long due to the + * background walker having already kicked the IO off on those dirty inodes. + */ +void +xfs_reclaim_inodes_nr( + struct xfs_mount *mp, + int nr_to_scan) +{ + /* kick background reclaimer and push the AIL */ + xfs_reclaim_work_queue(mp); + xfs_ail_push_all(mp->m_ail); + + xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); +} + +/* + * Return the number of reclaimable inodes in the filesystem for + * the shrinker to determine how much to reclaim. + */ +int +xfs_reclaim_inodes_count( + struct xfs_mount *mp) +{ + struct xfs_perag *pag; + xfs_agnumber_t ag = 0; + int reclaimable = 0; + + while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { + ag = pag->pag_agno + 1; + reclaimable += pag->pag_ici_reclaimable; + xfs_perag_put(pag); + } + return reclaimable; +} + diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h new file mode 100644 index 000000000000..0ba9c89c316e --- /dev/null +++ b/fs/xfs/xfs_icache.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef XFS_SYNC_H +#define XFS_SYNC_H 1 + +struct xfs_mount; +struct xfs_perag; + +#define SYNC_WAIT 0x0001 /* wait for i/o to complete */ +#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ + +void xfs_reclaim_worker(struct work_struct *work); + +int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); +int xfs_reclaim_inodes_count(struct xfs_mount *mp); +void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); + +void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); +void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); +void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, + struct xfs_inode *ip); + +int xfs_sync_inode_grab(struct xfs_inode *ip); +int xfs_inode_ag_iterator(struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), + int flags); + +#endif diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 784a803383ec..069c5ceb9459 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -38,6 +38,7 @@ #include "xfs_inode_item.h" #include "xfs_bmap.h" #include "xfs_trace.h" +#include "xfs_icache.h" /* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index c195ec85c725..6f1c997704cd 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -42,6 +42,7 @@ #include "xfs_fsops.h" #include "xfs_utils.h" #include "xfs_trace.h" +#include "xfs_icache.h" #ifdef HAVE_PERCPU_SB diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 7c417b6b99ee..a631ca3b9065 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -51,8 +51,6 @@ typedef struct xfs_trans_reservations { #else /* __KERNEL__ */ -#include "xfs_sync.h" - struct xlog; struct xfs_inode; struct xfs_mru_cache; diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 858a3b186110..7a9071f8855f 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -40,6 +40,7 @@ #include "xfs_utils.h" #include "xfs_qm.h" #include "xfs_trace.h" +#include "xfs_icache.h" STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index fdedf2cabae3..3d9ea947e9f8 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -49,7 +49,7 @@ #include "xfs_extfree_item.h" #include "xfs_mru_cache.h" #include "xfs_inode_item.h" -#include "xfs_sync.h" +#include "xfs_icache.h" #include "xfs_trace.h" #include diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c deleted file mode 100644 index 7b630288bab5..000000000000 --- a/fs/xfs/xfs_sync.c +++ /dev/null @@ -1,714 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_log_priv.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_trans_priv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_inode.h" -#include "xfs_dinode.h" -#include "xfs_error.h" -#include "xfs_filestream.h" -#include "xfs_vnodeops.h" -#include "xfs_inode_item.h" -#include "xfs_quota.h" -#include "xfs_trace.h" -#include "xfs_fsops.h" - -#include -#include - -/* - * The inode lookup is done in batches to keep the amount of lock traffic and - * radix tree lookups to a minimum. The batch size is a trade off between - * lookup reduction and stack usage. This is in the reclaim path, so we can't - * be too greedy. - */ -#define XFS_LOOKUP_BATCH 32 - -STATIC int -xfs_inode_ag_walk_grab( - struct xfs_inode *ip) -{ - struct inode *inode = VFS_I(ip); - - ASSERT(rcu_read_lock_held()); - - /* - * check for stale RCU freed inode - * - * If the inode has been reallocated, it doesn't matter if it's not in - * the AG we are walking - we are walking for writeback, so if it - * passes all the "valid inode" checks and is dirty, then we'll write - * it back anyway. If it has been reallocated and still being - * initialised, the XFS_INEW check below will catch it. - */ - spin_lock(&ip->i_flags_lock); - if (!ip->i_ino) - goto out_unlock_noent; - - /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ - if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) - goto out_unlock_noent; - spin_unlock(&ip->i_flags_lock); - - /* nothing to sync during shutdown */ - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return EFSCORRUPTED; - - /* If we can't grab the inode, it must on it's way to reclaim. */ - if (!igrab(inode)) - return ENOENT; - - if (is_bad_inode(inode)) { - IRELE(ip); - return ENOENT; - } - - /* inode is valid */ - return 0; - -out_unlock_noent: - spin_unlock(&ip->i_flags_lock); - return ENOENT; -} - -STATIC int -xfs_inode_ag_walk( - struct xfs_mount *mp, - struct xfs_perag *pag, - int (*execute)(struct xfs_inode *ip, - struct xfs_perag *pag, int flags), - int flags) -{ - uint32_t first_index; - int last_error = 0; - int skipped; - int done; - int nr_found; - -restart: - done = 0; - skipped = 0; - first_index = 0; - nr_found = 0; - do { - struct xfs_inode *batch[XFS_LOOKUP_BATCH]; - int error = 0; - int i; - - rcu_read_lock(); - nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, - (void **)batch, first_index, - XFS_LOOKUP_BATCH); - if (!nr_found) { - rcu_read_unlock(); - break; - } - - /* - * Grab the inodes before we drop the lock. if we found - * nothing, nr == 0 and the loop will be skipped. - */ - for (i = 0; i < nr_found; i++) { - struct xfs_inode *ip = batch[i]; - - if (done || xfs_inode_ag_walk_grab(ip)) - batch[i] = NULL; - - /* - * Update the index for the next lookup. Catch - * overflows into the next AG range which can occur if - * we have inodes in the last block of the AG and we - * are currently pointing to the last inode. - * - * Because we may see inodes that are from the wrong AG - * due to RCU freeing and reallocation, only update the - * index if it lies in this AG. 
It was a race that lead - * us to see this inode, so another lookup from the - * same index will not find it again. - */ - if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) - continue; - first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); - if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) - done = 1; - } - - /* unlock now we've grabbed the inodes. */ - rcu_read_unlock(); - - for (i = 0; i < nr_found; i++) { - if (!batch[i]) - continue; - error = execute(batch[i], pag, flags); - IRELE(batch[i]); - if (error == EAGAIN) { - skipped++; - continue; - } - if (error && last_error != EFSCORRUPTED) - last_error = error; - } - - /* bail out if the filesystem is corrupted. */ - if (error == EFSCORRUPTED) - break; - - cond_resched(); - - } while (nr_found && !done); - - if (skipped) { - delay(1); - goto restart; - } - return last_error; -} - -int -xfs_inode_ag_iterator( - struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, - struct xfs_perag *pag, int flags), - int flags) -{ - struct xfs_perag *pag; - int error = 0; - int last_error = 0; - xfs_agnumber_t ag; - - ag = 0; - while ((pag = xfs_perag_get(mp, ag))) { - ag = pag->pag_agno + 1; - error = xfs_inode_ag_walk(mp, pag, execute, flags); - xfs_perag_put(pag); - if (error) { - last_error = error; - if (error == EFSCORRUPTED) - break; - } - } - return XFS_ERROR(last_error); -} - -/* - * Queue a new inode reclaim pass if there are reclaimable inodes and there - * isn't a reclaim pass already in progress. By default it runs every 5s based - * on the xfs periodic sync default of 30s. Perhaps this should have it's own - * tunable, but that can be done if this method proves to be ineffective or too - * aggressive. - */ -static void -xfs_reclaim_work_queue( - struct xfs_mount *mp) -{ - - rcu_read_lock(); - if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { - queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, - msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); - } - rcu_read_unlock(); -} - -/* - * This is a fast pass over the inode cache to try to get reclaim moving on as - * many inodes as possible in a short period of time. It kicks itself every few - * seconds, as well as being kicked by the inode cache shrinker when memory - * goes low. It scans as quickly as possible avoiding locked inodes or those - * already being flushed, and once done schedules a future pass. - */ -void -xfs_reclaim_worker( - struct work_struct *work) -{ - struct xfs_mount *mp = container_of(to_delayed_work(work), - struct xfs_mount, m_reclaim_work); - - xfs_reclaim_inodes(mp, SYNC_TRYLOCK); - xfs_reclaim_work_queue(mp); -} - -void -__xfs_inode_set_reclaim_tag( - struct xfs_perag *pag, - struct xfs_inode *ip) -{ - radix_tree_tag_set(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), - XFS_ICI_RECLAIM_TAG); - - if (!pag->pag_ici_reclaimable) { - /* propagate the reclaim tag up into the perag radix tree */ - spin_lock(&ip->i_mount->m_perag_lock); - radix_tree_tag_set(&ip->i_mount->m_perag_tree, - XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - XFS_ICI_RECLAIM_TAG); - spin_unlock(&ip->i_mount->m_perag_lock); - - /* schedule periodic background inode reclaim */ - xfs_reclaim_work_queue(ip->i_mount); - - trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, - -1, _RET_IP_); - } - pag->pag_ici_reclaimable++; -} - -/* - * We set the inode flag atomically with the radix tree tag. - * Once we get tag lookups on the radix tree, this inode flag - * can go away. 
- */ -void -xfs_inode_set_reclaim_tag( - xfs_inode_t *ip) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_perag *pag; - - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - spin_lock(&pag->pag_ici_lock); - spin_lock(&ip->i_flags_lock); - __xfs_inode_set_reclaim_tag(pag, ip); - __xfs_iflags_set(ip, XFS_IRECLAIMABLE); - spin_unlock(&ip->i_flags_lock); - spin_unlock(&pag->pag_ici_lock); - xfs_perag_put(pag); -} - -STATIC void -__xfs_inode_clear_reclaim( - xfs_perag_t *pag, - xfs_inode_t *ip) -{ - pag->pag_ici_reclaimable--; - if (!pag->pag_ici_reclaimable) { - /* clear the reclaim tag from the perag radix tree */ - spin_lock(&ip->i_mount->m_perag_lock); - radix_tree_tag_clear(&ip->i_mount->m_perag_tree, - XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - XFS_ICI_RECLAIM_TAG); - spin_unlock(&ip->i_mount->m_perag_lock); - trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno, - -1, _RET_IP_); - } -} - -void -__xfs_inode_clear_reclaim_tag( - xfs_mount_t *mp, - xfs_perag_t *pag, - xfs_inode_t *ip) -{ - radix_tree_tag_clear(&pag->pag_ici_root, - XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); - __xfs_inode_clear_reclaim(pag, ip); -} - -/* - * Grab the inode for reclaim exclusively. - * Return 0 if we grabbed it, non-zero otherwise. - */ -STATIC int -xfs_reclaim_inode_grab( - struct xfs_inode *ip, - int flags) -{ - ASSERT(rcu_read_lock_held()); - - /* quick check for stale RCU freed inode */ - if (!ip->i_ino) - return 1; - - /* - * If we are asked for non-blocking operation, do unlocked checks to - * see if the inode already is being flushed or in reclaim to avoid - * lock traffic. - */ - if ((flags & SYNC_TRYLOCK) && - __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM)) - return 1; - - /* - * The radix tree lock here protects a thread in xfs_iget from racing - * with us starting reclaim on the inode. Once we have the - * XFS_IRECLAIM flag set it will not touch us. - * - * Due to RCU lookup, we may find inodes that have been freed and only - * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that - * aren't candidates for reclaim at all, so we must check the - * XFS_IRECLAIMABLE is set first before proceeding to reclaim. - */ - spin_lock(&ip->i_flags_lock); - if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) || - __xfs_iflags_test(ip, XFS_IRECLAIM)) { - /* not a reclaim candidate. */ - spin_unlock(&ip->i_flags_lock); - return 1; - } - __xfs_iflags_set(ip, XFS_IRECLAIM); - spin_unlock(&ip->i_flags_lock); - return 0; -} - -/* - * Inodes in different states need to be treated differently. The following - * table lists the inode states and the reclaim actions necessary: - * - * inode state iflush ret required action - * --------------- ---------- --------------- - * bad - reclaim - * shutdown EIO unpin and reclaim - * clean, unpinned 0 reclaim - * stale, unpinned 0 reclaim - * clean, pinned(*) 0 requeue - * stale, pinned EAGAIN requeue - * dirty, async - requeue - * dirty, sync 0 reclaim - * - * (*) dgc: I don't think the clean, pinned state is possible but it gets - * handled anyway given the order of checks implemented. - * - * Also, because we get the flush lock first, we know that any inode that has - * been flushed delwri has had the flush completed by the time we check that - * the inode is clean. - * - * Note that because the inode is flushed delayed write by AIL pushing, the - * flush lock may already be held here and waiting on it can result in very - * long latencies. 
Hence for sync reclaims, where we wait on the flush lock, - * the caller should push the AIL first before trying to reclaim inodes to - * minimise the amount of time spent waiting. For background relaim, we only - * bother to reclaim clean inodes anyway. - * - * Hence the order of actions after gaining the locks should be: - * bad => reclaim - * shutdown => unpin and reclaim - * pinned, async => requeue - * pinned, sync => unpin - * stale => reclaim - * clean => reclaim - * dirty, async => requeue - * dirty, sync => flush, wait and reclaim - */ -STATIC int -xfs_reclaim_inode( - struct xfs_inode *ip, - struct xfs_perag *pag, - int sync_mode) -{ - struct xfs_buf *bp = NULL; - int error; - -restart: - error = 0; - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (!xfs_iflock_nowait(ip)) { - if (!(sync_mode & SYNC_WAIT)) - goto out; - xfs_iflock(ip); - } - - if (is_bad_inode(VFS_I(ip))) - goto reclaim; - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_iunpin_wait(ip); - xfs_iflush_abort(ip, false); - goto reclaim; - } - if (xfs_ipincount(ip)) { - if (!(sync_mode & SYNC_WAIT)) - goto out_ifunlock; - xfs_iunpin_wait(ip); - } - if (xfs_iflags_test(ip, XFS_ISTALE)) - goto reclaim; - if (xfs_inode_clean(ip)) - goto reclaim; - - /* - * Never flush out dirty data during non-blocking reclaim, as it would - * just contend with AIL pushing trying to do the same job. - */ - if (!(sync_mode & SYNC_WAIT)) - goto out_ifunlock; - - /* - * Now we have an inode that needs flushing. - * - * Note that xfs_iflush will never block on the inode buffer lock, as - * xfs_ifree_cluster() can lock the inode buffer before it locks the - * ip->i_lock, and we are doing the exact opposite here. As a result, - * doing a blocking xfs_imap_to_bp() to get the cluster buffer would - * result in an ABBA deadlock with xfs_ifree_cluster(). - * - * As xfs_ifree_cluser() must gather all inodes that are active in the - * cache to mark them stale, if we hit this case we don't actually want - * to do IO here - we want the inode marked stale so we can simply - * reclaim it. Hence if we get an EAGAIN error here, just unlock the - * inode, back off and try again. Hopefully the next pass through will - * see the stale flag set on the inode. - */ - error = xfs_iflush(ip, &bp); - if (error == EAGAIN) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - /* backoff longer than in xfs_ifree_cluster */ - delay(2); - goto restart; - } - - if (!error) { - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - } - - xfs_iflock(ip); -reclaim: - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - XFS_STATS_INC(xs_ig_reclaims); - /* - * Remove the inode from the per-AG radix tree. - * - * Because radix_tree_delete won't complain even if the item was never - * added to the tree assert that it's been there before to catch - * problems with the inode life time early on. - */ - spin_lock(&pag->pag_ici_lock); - if (!radix_tree_delete(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) - ASSERT(0); - __xfs_inode_clear_reclaim(pag, ip); - spin_unlock(&pag->pag_ici_lock); - - /* - * Here we do an (almost) spurious inode lock in order to coordinate - * with inode cache radix tree lookups. This is because the lookup - * can reference the inodes in the cache without taking references. - * - * We make that OK here by ensuring that we wait until the inode is - * unlocked after the lookup before we go ahead and free it. 
- */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_qm_dqdetach(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - xfs_inode_free(ip); - return error; - -out_ifunlock: - xfs_ifunlock(ip); -out: - xfs_iflags_clear(ip, XFS_IRECLAIM); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - /* - * We could return EAGAIN here to make reclaim rescan the inode tree in - * a short while. However, this just burns CPU time scanning the tree - * waiting for IO to complete and the reclaim work never goes back to - * the idle state. Instead, return 0 to let the next scheduled - * background reclaim attempt to reclaim the inode again. - */ - return 0; -} - -/* - * Walk the AGs and reclaim the inodes in them. Even if the filesystem is - * corrupted, we still want to try to reclaim all the inodes. If we don't, - * then a shut down during filesystem unmount reclaim walk leak all the - * unreclaimed inodes. - */ -int -xfs_reclaim_inodes_ag( - struct xfs_mount *mp, - int flags, - int *nr_to_scan) -{ - struct xfs_perag *pag; - int error = 0; - int last_error = 0; - xfs_agnumber_t ag; - int trylock = flags & SYNC_TRYLOCK; - int skipped; - -restart: - ag = 0; - skipped = 0; - while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { - unsigned long first_index = 0; - int done = 0; - int nr_found = 0; - - ag = pag->pag_agno + 1; - - if (trylock) { - if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) { - skipped++; - xfs_perag_put(pag); - continue; - } - first_index = pag->pag_ici_reclaim_cursor; - } else - mutex_lock(&pag->pag_ici_reclaim_lock); - - do { - struct xfs_inode *batch[XFS_LOOKUP_BATCH]; - int i; - - rcu_read_lock(); - nr_found = radix_tree_gang_lookup_tag( - &pag->pag_ici_root, - (void **)batch, first_index, - XFS_LOOKUP_BATCH, - XFS_ICI_RECLAIM_TAG); - if (!nr_found) { - done = 1; - rcu_read_unlock(); - break; - } - - /* - * Grab the inodes before we drop the lock. if we found - * nothing, nr == 0 and the loop will be skipped. - */ - for (i = 0; i < nr_found; i++) { - struct xfs_inode *ip = batch[i]; - - if (done || xfs_reclaim_inode_grab(ip, flags)) - batch[i] = NULL; - - /* - * Update the index for the next lookup. Catch - * overflows into the next AG range which can - * occur if we have inodes in the last block of - * the AG and we are currently pointing to the - * last inode. - * - * Because we may see inodes that are from the - * wrong AG due to RCU freeing and - * reallocation, only update the index if it - * lies in this AG. It was a race that lead us - * to see this inode, so another lookup from - * the same index will not find it again. - */ - if (XFS_INO_TO_AGNO(mp, ip->i_ino) != - pag->pag_agno) - continue; - first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); - if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) - done = 1; - } - - /* unlock now we've grabbed the inodes. */ - rcu_read_unlock(); - - for (i = 0; i < nr_found; i++) { - if (!batch[i]) - continue; - error = xfs_reclaim_inode(batch[i], pag, flags); - if (error && last_error != EFSCORRUPTED) - last_error = error; - } - - *nr_to_scan -= XFS_LOOKUP_BATCH; - - cond_resched(); - - } while (nr_found && !done && *nr_to_scan > 0); - - if (trylock && !done) - pag->pag_ici_reclaim_cursor = first_index; - else - pag->pag_ici_reclaim_cursor = 0; - mutex_unlock(&pag->pag_ici_reclaim_lock); - xfs_perag_put(pag); - } - - /* - * if we skipped any AG, and we still have scan count remaining, do - * another pass this time using blocking reclaim semantics (i.e - * waiting on the reclaim locks and ignoring the reclaim cursors). 
This - * ensure that when we get more reclaimers than AGs we block rather - * than spin trying to execute reclaim. - */ - if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) { - trylock = 0; - goto restart; - } - return XFS_ERROR(last_error); -} - -int -xfs_reclaim_inodes( - xfs_mount_t *mp, - int mode) -{ - int nr_to_scan = INT_MAX; - - return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan); -} - -/* - * Scan a certain number of inodes for reclaim. - * - * When called we make sure that there is a background (fast) inode reclaim in - * progress, while we will throttle the speed of reclaim via doing synchronous - * reclaim of inodes. That means if we come across dirty inodes, we wait for - * them to be cleaned, which we hope will not be very long due to the - * background walker having already kicked the IO off on those dirty inodes. - */ -void -xfs_reclaim_inodes_nr( - struct xfs_mount *mp, - int nr_to_scan) -{ - /* kick background reclaimer and push the AIL */ - xfs_reclaim_work_queue(mp); - xfs_ail_push_all(mp->m_ail); - - xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); -} - -/* - * Return the number of reclaimable inodes in the filesystem for - * the shrinker to determine how much to reclaim. - */ -int -xfs_reclaim_inodes_count( - struct xfs_mount *mp) -{ - struct xfs_perag *pag; - xfs_agnumber_t ag = 0; - int reclaimable = 0; - - while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { - ag = pag->pag_agno + 1; - reclaimable += pag->pag_ici_reclaimable; - xfs_perag_put(pag); - } - return reclaimable; -} - diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h deleted file mode 100644 index 0ba9c89c316e..000000000000 --- a/fs/xfs/xfs_sync.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2000-2006 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef XFS_SYNC_H -#define XFS_SYNC_H 1 - -struct xfs_mount; -struct xfs_perag; - -#define SYNC_WAIT 0x0001 /* wait for i/o to complete */ -#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ - -void xfs_reclaim_worker(struct work_struct *work); - -int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); -int xfs_reclaim_inodes_count(struct xfs_mount *mp); -void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); - -void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); -void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); -void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, - struct xfs_inode *ip); - -int xfs_sync_inode_grab(struct xfs_inode *ip); -int xfs_inode_ag_iterator(struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), - int flags); - -#endif -- cgit v1.2.1 From fa96acadf1eb712fca6d59922ad93787c87e44ec Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:10 +1100 Subject: xfs: move inode locking functions to xfs_inode.c xfs_ilock() and friends really aren't related to the inode cache in any way, so move them to xfs_inode.c with all the other inode related functionality. While doing this move, move the xfs_ilock() tracepoints to *before* the lock is taken so that when a hang on a lock occurs we have events to indicate which process and what inode we were trying to lock when the hang occurred. This is much better than the current silence we get on a hang... Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_iget.c | 251 ----------------------------------------------------- fs/xfs/xfs_inode.c | 250 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 250 insertions(+), 251 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 069c5ceb9459..ea9a5fa49a48 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -453,254 +453,3 @@ out_error_or_again: return error; } -/* - * This is a wrapper routine around the xfs_ilock() routine - * used to centralize some grungy code. It is used in places - * that wish to lock the inode solely for reading the extents. - * The reason these places can't just call xfs_ilock(SHARED) - * is that the inode lock also guards to bringing in of the - * extents from disk for a file in b-tree format. If the inode - * is in b-tree format, then we need to lock the inode exclusively - * until the extents are read in. Locking it exclusively all - * the time would limit our parallelism unnecessarily, though. - * What we do instead is check to see if the extents have been - * read in yet, and only lock the inode exclusively if they - * have not. - * - * The function returns a value which should be given to the - * corresponding xfs_iunlock_map_shared(). This value is - * the mode in which the lock was actually taken. - */ -uint -xfs_ilock_map_shared( - xfs_inode_t *ip) -{ - uint lock_mode; - - if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) && - ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) { - lock_mode = XFS_ILOCK_EXCL; - } else { - lock_mode = XFS_ILOCK_SHARED; - } - - xfs_ilock(ip, lock_mode); - - return lock_mode; -} - -/* - * This is simply the unlock routine to go with xfs_ilock_map_shared(). 
- * All it does is call xfs_iunlock() with the given lock_mode. - */ -void -xfs_iunlock_map_shared( - xfs_inode_t *ip, - unsigned int lock_mode) -{ - xfs_iunlock(ip, lock_mode); -} - -/* - * The xfs inode contains 2 locks: a multi-reader lock called the - * i_iolock and a multi-reader lock called the i_lock. This routine - * allows either or both of the locks to be obtained. - * - * The 2 locks should always be ordered so that the IO lock is - * obtained first in order to prevent deadlock. - * - * ip -- the inode being locked - * lock_flags -- this parameter indicates the inode's locks - * to be locked. It can be: - * XFS_IOLOCK_SHARED, - * XFS_IOLOCK_EXCL, - * XFS_ILOCK_SHARED, - * XFS_ILOCK_EXCL, - * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED, - * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL, - * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED, - * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL - */ -void -xfs_ilock( - xfs_inode_t *ip, - uint lock_flags) -{ - /* - * You can't set both SHARED and EXCL for the same lock, - * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, - * and XFS_ILOCK_EXCL are valid values to set in lock_flags. - */ - ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != - (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); - ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != - (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); - - if (lock_flags & XFS_IOLOCK_EXCL) - mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); - else if (lock_flags & XFS_IOLOCK_SHARED) - mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); - - if (lock_flags & XFS_ILOCK_EXCL) - mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); - else if (lock_flags & XFS_ILOCK_SHARED) - mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); - - trace_xfs_ilock(ip, lock_flags, _RET_IP_); -} - -/* - * This is just like xfs_ilock(), except that the caller - * is guaranteed not to sleep. It returns 1 if it gets - * the requested locks and 0 otherwise. If the IO lock is - * obtained but the inode lock cannot be, then the IO lock - * is dropped before returning. - * - * ip -- the inode being locked - * lock_flags -- this parameter indicates the inode's locks to be - * to be locked. See the comment for xfs_ilock() for a list - * of valid values. - */ -int -xfs_ilock_nowait( - xfs_inode_t *ip, - uint lock_flags) -{ - /* - * You can't set both SHARED and EXCL for the same lock, - * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, - * and XFS_ILOCK_EXCL are valid values to set in lock_flags. 
- */ - ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != - (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); - ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != - (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); - - if (lock_flags & XFS_IOLOCK_EXCL) { - if (!mrtryupdate(&ip->i_iolock)) - goto out; - } else if (lock_flags & XFS_IOLOCK_SHARED) { - if (!mrtryaccess(&ip->i_iolock)) - goto out; - } - if (lock_flags & XFS_ILOCK_EXCL) { - if (!mrtryupdate(&ip->i_lock)) - goto out_undo_iolock; - } else if (lock_flags & XFS_ILOCK_SHARED) { - if (!mrtryaccess(&ip->i_lock)) - goto out_undo_iolock; - } - trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_); - return 1; - - out_undo_iolock: - if (lock_flags & XFS_IOLOCK_EXCL) - mrunlock_excl(&ip->i_iolock); - else if (lock_flags & XFS_IOLOCK_SHARED) - mrunlock_shared(&ip->i_iolock); - out: - return 0; -} - -/* - * xfs_iunlock() is used to drop the inode locks acquired with - * xfs_ilock() and xfs_ilock_nowait(). The caller must pass - * in the flags given to xfs_ilock() or xfs_ilock_nowait() so - * that we know which locks to drop. - * - * ip -- the inode being unlocked - * lock_flags -- this parameter indicates the inode's locks to be - * to be unlocked. See the comment for xfs_ilock() for a list - * of valid values for this parameter. - * - */ -void -xfs_iunlock( - xfs_inode_t *ip, - uint lock_flags) -{ - /* - * You can't set both SHARED and EXCL for the same lock, - * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, - * and XFS_ILOCK_EXCL are valid values to set in lock_flags. - */ - ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != - (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); - ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != - (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); - ASSERT(lock_flags != 0); - - if (lock_flags & XFS_IOLOCK_EXCL) - mrunlock_excl(&ip->i_iolock); - else if (lock_flags & XFS_IOLOCK_SHARED) - mrunlock_shared(&ip->i_iolock); - - if (lock_flags & XFS_ILOCK_EXCL) - mrunlock_excl(&ip->i_lock); - else if (lock_flags & XFS_ILOCK_SHARED) - mrunlock_shared(&ip->i_lock); - - trace_xfs_iunlock(ip, lock_flags, _RET_IP_); -} - -/* - * give up write locks. the i/o lock cannot be held nested - * if it is being demoted. 
- */ -void -xfs_ilock_demote( - xfs_inode_t *ip, - uint lock_flags) -{ - ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); - - if (lock_flags & XFS_ILOCK_EXCL) - mrdemote(&ip->i_lock); - if (lock_flags & XFS_IOLOCK_EXCL) - mrdemote(&ip->i_iolock); - - trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); -} - -#ifdef DEBUG -int -xfs_isilocked( - xfs_inode_t *ip, - uint lock_flags) -{ - if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) { - if (!(lock_flags & XFS_ILOCK_SHARED)) - return !!ip->i_lock.mr_writer; - return rwsem_is_locked(&ip->i_lock.mr_lock); - } - - if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { - if (!(lock_flags & XFS_IOLOCK_SHARED)) - return !!ip->i_iolock.mr_writer; - return rwsem_is_locked(&ip->i_iolock.mr_lock); - } - - ASSERT(0); - return 0; -} -#endif - -void -__xfs_iflock( - struct xfs_inode *ip) -{ - wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); - DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); - - do { - prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); - if (xfs_isiflocked(ip)) - io_schedule(); - } while (!xfs_iflock_nowait(ip)); - - finish_wait(wq, &wait.wait); -} diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 2778258fcfa2..ba404e4b9f0c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -74,6 +74,256 @@ xfs_get_extsz_hint( return 0; } +/* + * This is a wrapper routine around the xfs_ilock() routine used to centralize + * some grungy code. It is used in places that wish to lock the inode solely + * for reading the extents. The reason these places can't just call + * xfs_ilock(SHARED) is that the inode lock also guards to bringing in of the + * extents from disk for a file in b-tree format. If the inode is in b-tree + * format, then we need to lock the inode exclusively until the extents are read + * in. Locking it exclusively all the time would limit our parallelism + * unnecessarily, though. What we do instead is check to see if the extents + * have been read in yet, and only lock the inode exclusively if they have not. + * + * The function returns a value which should be given to the corresponding + * xfs_iunlock_map_shared(). This value is the mode in which the lock was + * actually taken. + */ +uint +xfs_ilock_map_shared( + xfs_inode_t *ip) +{ + uint lock_mode; + + if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) && + ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) { + lock_mode = XFS_ILOCK_EXCL; + } else { + lock_mode = XFS_ILOCK_SHARED; + } + + xfs_ilock(ip, lock_mode); + + return lock_mode; +} + +/* + * This is simply the unlock routine to go with xfs_ilock_map_shared(). + * All it does is call xfs_iunlock() with the given lock_mode. + */ +void +xfs_iunlock_map_shared( + xfs_inode_t *ip, + unsigned int lock_mode) +{ + xfs_iunlock(ip, lock_mode); +} + +/* + * The xfs inode contains 2 locks: a multi-reader lock called the + * i_iolock and a multi-reader lock called the i_lock. This routine + * allows either or both of the locks to be obtained. + * + * The 2 locks should always be ordered so that the IO lock is + * obtained first in order to prevent deadlock. + * + * ip -- the inode being locked + * lock_flags -- this parameter indicates the inode's locks + * to be locked. 
It can be: + * XFS_IOLOCK_SHARED, + * XFS_IOLOCK_EXCL, + * XFS_ILOCK_SHARED, + * XFS_ILOCK_EXCL, + * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED, + * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL, + * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED, + * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL + */ +void +xfs_ilock( + xfs_inode_t *ip, + uint lock_flags) +{ + trace_xfs_ilock(ip, lock_flags, _RET_IP_); + + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. + */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + + if (lock_flags & XFS_IOLOCK_EXCL) + mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + else if (lock_flags & XFS_IOLOCK_SHARED) + mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + + if (lock_flags & XFS_ILOCK_EXCL) + mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); + else if (lock_flags & XFS_ILOCK_SHARED) + mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); +} + +/* + * This is just like xfs_ilock(), except that the caller + * is guaranteed not to sleep. It returns 1 if it gets + * the requested locks and 0 otherwise. If the IO lock is + * obtained but the inode lock cannot be, then the IO lock + * is dropped before returning. + * + * ip -- the inode being locked + * lock_flags -- this parameter indicates the inode's locks to be + * to be locked. See the comment for xfs_ilock() for a list + * of valid values. + */ +int +xfs_ilock_nowait( + xfs_inode_t *ip, + uint lock_flags) +{ + trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_); + + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. + */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + + if (lock_flags & XFS_IOLOCK_EXCL) { + if (!mrtryupdate(&ip->i_iolock)) + goto out; + } else if (lock_flags & XFS_IOLOCK_SHARED) { + if (!mrtryaccess(&ip->i_iolock)) + goto out; + } + if (lock_flags & XFS_ILOCK_EXCL) { + if (!mrtryupdate(&ip->i_lock)) + goto out_undo_iolock; + } else if (lock_flags & XFS_ILOCK_SHARED) { + if (!mrtryaccess(&ip->i_lock)) + goto out_undo_iolock; + } + return 1; + + out_undo_iolock: + if (lock_flags & XFS_IOLOCK_EXCL) + mrunlock_excl(&ip->i_iolock); + else if (lock_flags & XFS_IOLOCK_SHARED) + mrunlock_shared(&ip->i_iolock); + out: + return 0; +} + +/* + * xfs_iunlock() is used to drop the inode locks acquired with + * xfs_ilock() and xfs_ilock_nowait(). The caller must pass + * in the flags given to xfs_ilock() or xfs_ilock_nowait() so + * that we know which locks to drop. + * + * ip -- the inode being unlocked + * lock_flags -- this parameter indicates the inode's locks to be + * to be unlocked. See the comment for xfs_ilock() for a list + * of valid values for this parameter. 
+ * + */ +void +xfs_iunlock( + xfs_inode_t *ip, + uint lock_flags) +{ + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. + */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + ASSERT(lock_flags != 0); + + if (lock_flags & XFS_IOLOCK_EXCL) + mrunlock_excl(&ip->i_iolock); + else if (lock_flags & XFS_IOLOCK_SHARED) + mrunlock_shared(&ip->i_iolock); + + if (lock_flags & XFS_ILOCK_EXCL) + mrunlock_excl(&ip->i_lock); + else if (lock_flags & XFS_ILOCK_SHARED) + mrunlock_shared(&ip->i_lock); + + trace_xfs_iunlock(ip, lock_flags, _RET_IP_); +} + +/* + * give up write locks. the i/o lock cannot be held nested + * if it is being demoted. + */ +void +xfs_ilock_demote( + xfs_inode_t *ip, + uint lock_flags) +{ + ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); + + if (lock_flags & XFS_ILOCK_EXCL) + mrdemote(&ip->i_lock); + if (lock_flags & XFS_IOLOCK_EXCL) + mrdemote(&ip->i_iolock); + + trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); +} + +#ifdef DEBUG +int +xfs_isilocked( + xfs_inode_t *ip, + uint lock_flags) +{ + if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) { + if (!(lock_flags & XFS_ILOCK_SHARED)) + return !!ip->i_lock.mr_writer; + return rwsem_is_locked(&ip->i_lock.mr_lock); + } + + if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { + if (!(lock_flags & XFS_IOLOCK_SHARED)) + return !!ip->i_iolock.mr_writer; + return rwsem_is_locked(&ip->i_iolock.mr_lock); + } + + ASSERT(0); + return 0; +} +#endif + +void +__xfs_iflock( + struct xfs_inode *ip) +{ + wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); + DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); + + do { + prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + if (xfs_isiflocked(ip)) + io_schedule(); + } while (!xfs_iflock_nowait(ip)); + + finish_wait(wq, &wait.wait); +} + #ifdef DEBUG /* * Make sure that the extents in the given memory buffer -- cgit v1.2.1 From 33479e0542df066fb0b47df18780e93bfe6e0dc5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:11 +1100 Subject: xfs: remove xfs_iget.c The inode cache functions remaining in xfs_iget.c can be moved to xfs_icache.c along with the other inode cache functions. This removes all functionality from xfs_iget.c, so the file can simply be removed. This move results in various functions now only having the scope of a single file (e.g. xfs_inode_free()), so clean up all the definitions and exported prototypes in xfs_icache.[ch] and xfs_inode.h appropriately. 
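The scope reduction follows the usual C pattern: once a function's only callers live in the same file, its prototype is deleted from the shared header and the definition gains static (or the kernel's STATIC) linkage. A minimal sketch of that pattern with invented names - struct icache and icache_free are illustrative, not the real symbols touched by this patch:

    struct icache;                          /* opaque to other files */

    /*
     * Before the move, a shared header exported the symbol tree-wide:
     *     void icache_free(struct icache *ic);
     * After the move, the definition becomes file-local:
     */
    static void icache_free(struct icache *ic)
    {
            /* teardown work happens here; no caller outside this file */
    }

    static void icache_shrink(struct icache *ic)
    {
            icache_free(ic);        /* the one remaining caller, same file */
    }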
Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/Makefile | 1 - fs/xfs/xfs_export.c | 1 + fs/xfs/xfs_icache.c | 421 ++++++++++++++++++++++++++++++++++++++++++- fs/xfs/xfs_icache.h | 6 +- fs/xfs/xfs_iget.c | 455 ----------------------------------------------- fs/xfs/xfs_inode.c | 1 + fs/xfs/xfs_inode.h | 10 +- fs/xfs/xfs_itable.c | 1 + fs/xfs/xfs_log_recover.c | 1 + fs/xfs/xfs_qm.c | 1 + fs/xfs/xfs_rtalloc.c | 1 + fs/xfs/xfs_vnodeops.c | 1 + 12 files changed, 430 insertions(+), 470 deletions(-) delete mode 100644 fs/xfs/xfs_iget.c (limited to 'fs') diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 442f256dbcac..e65357bb3dc6 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -40,7 +40,6 @@ xfs-y += xfs_aops.o \ xfs_fs_subr.o \ xfs_globals.o \ xfs_icache.o \ - xfs_iget.o \ xfs_ioctl.o \ xfs_iomap.o \ xfs_iops.o \ diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 8c6d1d70278c..a83611849cee 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -29,6 +29,7 @@ #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_trace.h" +#include "xfs_icache.h" /* * Note that we only accept fileids which are long enough rather than allow diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index eba216f11d5e..9c8703b5cd72 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -41,6 +41,421 @@ #include #include +STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_inode *ip); + +/* + * Allocate and initialise an xfs_inode. + */ +STATIC struct xfs_inode * +xfs_inode_alloc( + struct xfs_mount *mp, + xfs_ino_t ino) +{ + struct xfs_inode *ip; + + /* + * if this didn't occur in transactions, we could use + * KM_MAYFAIL and return NULL here on ENOMEM. Set the + * code up to do this anyway. + */ + ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); + if (!ip) + return NULL; + if (inode_init_always(mp->m_super, VFS_I(ip))) { + kmem_zone_free(xfs_inode_zone, ip); + return NULL; + } + + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!spin_is_locked(&ip->i_flags_lock)); + ASSERT(!xfs_isiflocked(ip)); + ASSERT(ip->i_ino == 0); + + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + + /* initialise the xfs inode */ + ip->i_ino = ino; + ip->i_mount = mp; + memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); + ip->i_afp = NULL; + memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); + ip->i_flags = 0; + ip->i_delayed_blks = 0; + memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); + + return ip; +} + +STATIC void +xfs_inode_free_callback( + struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct xfs_inode *ip = XFS_I(inode); + + kmem_zone_free(xfs_inode_zone, ip); +} + +STATIC void +xfs_inode_free( + struct xfs_inode *ip) +{ + switch (ip->i_d.di_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + xfs_idestroy_fork(ip, XFS_DATA_FORK); + break; + } + + if (ip->i_afp) + xfs_idestroy_fork(ip, XFS_ATTR_FORK); + + if (ip->i_itemp) { + ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL)); + xfs_inode_item_destroy(ip); + ip->i_itemp = NULL; + } + + /* asserts to verify all state is correct here */ + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!spin_is_locked(&ip->i_flags_lock)); + ASSERT(!xfs_isiflocked(ip)); + + /* + * Because we use RCU freeing we need to ensure the inode always + * appears to be reclaimed with an invalid inode number when in the + * free state. 
The ip->i_flags_lock provides the barrier against lookup + * races. + */ + spin_lock(&ip->i_flags_lock); + ip->i_flags = XFS_IRECLAIM; + ip->i_ino = 0; + spin_unlock(&ip->i_flags_lock); + + call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); +} + +/* + * Check the validity of the inode we just found it the cache + */ +static int +xfs_iget_cache_hit( + struct xfs_perag *pag, + struct xfs_inode *ip, + xfs_ino_t ino, + int flags, + int lock_flags) __releases(RCU) +{ + struct inode *inode = VFS_I(ip); + struct xfs_mount *mp = ip->i_mount; + int error; + + /* + * check for re-use of an inode within an RCU grace period due to the + * radix tree nodes not being updated yet. We monitor for this by + * setting the inode number to zero before freeing the inode structure. + * If the inode has been reallocated and set up, then the inode number + * will not match, so check for that, too. + */ + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != ino) { + trace_xfs_iget_skip(ip); + XFS_STATS_INC(xs_ig_frecycle); + error = EAGAIN; + goto out_error; + } + + + /* + * If we are racing with another cache hit that is currently + * instantiating this inode or currently recycling it out of + * reclaimabe state, wait for the initialisation to complete + * before continuing. + * + * XXX(hch): eventually we should do something equivalent to + * wait_on_inode to wait for these flags to be cleared + * instead of polling for it. + */ + if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { + trace_xfs_iget_skip(ip); + XFS_STATS_INC(xs_ig_frecycle); + error = EAGAIN; + goto out_error; + } + + /* + * If lookup is racing with unlink return an error immediately. + */ + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { + error = ENOENT; + goto out_error; + } + + /* + * If IRECLAIMABLE is set, we've torn down the VFS inode already. + * Need to carefully get it back into useable state. + */ + if (ip->i_flags & XFS_IRECLAIMABLE) { + trace_xfs_iget_reclaim(ip); + + /* + * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode + * from stomping over us while we recycle the inode. We can't + * clear the radix tree reclaimable tag yet as it requires + * pag_ici_lock to be held exclusive. + */ + ip->i_flags |= XFS_IRECLAIM; + + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + + error = -inode_init_always(mp->m_super, inode); + if (error) { + /* + * Re-initializing the inode failed, and we are in deep + * trouble. Try to re-add it to the reclaim list. + */ + rcu_read_lock(); + spin_lock(&ip->i_flags_lock); + + ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); + ASSERT(ip->i_flags & XFS_IRECLAIMABLE); + trace_xfs_iget_reclaim_fail(ip); + goto out_error; + } + + spin_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + + /* + * Clear the per-lifetime state in the inode as we are now + * effectively a new inode and need to return to the initial + * state before reuse occurs. + */ + ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; + ip->i_flags |= XFS_INEW; + __xfs_inode_clear_reclaim_tag(mp, pag, ip); + inode->i_state = I_NEW; + + ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + + spin_unlock(&ip->i_flags_lock); + spin_unlock(&pag->pag_ici_lock); + } else { + /* If the VFS inode is being torn down, pause and try again. */ + if (!igrab(inode)) { + trace_xfs_iget_skip(ip); + error = EAGAIN; + goto out_error; + } + + /* We've got a live one. 
*/ + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + trace_xfs_iget_hit(ip); + } + + if (lock_flags != 0) + xfs_ilock(ip, lock_flags); + + xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); + XFS_STATS_INC(xs_ig_found); + + return 0; + +out_error: + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + return error; +} + + +static int +xfs_iget_cache_miss( + struct xfs_mount *mp, + struct xfs_perag *pag, + xfs_trans_t *tp, + xfs_ino_t ino, + struct xfs_inode **ipp, + int flags, + int lock_flags) +{ + struct xfs_inode *ip; + int error; + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); + int iflags; + + ip = xfs_inode_alloc(mp, ino); + if (!ip) + return ENOMEM; + + error = xfs_iread(mp, tp, ip, flags); + if (error) + goto out_destroy; + + trace_xfs_iget_miss(ip); + + if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { + error = ENOENT; + goto out_destroy; + } + + /* + * Preload the radix tree so we can insert safely under the + * write spinlock. Note that we cannot sleep inside the preload + * region. Since we can be called from transaction context, don't + * recurse into the file system. + */ + if (radix_tree_preload(GFP_NOFS)) { + error = EAGAIN; + goto out_destroy; + } + + /* + * Because the inode hasn't been added to the radix-tree yet it can't + * be found by another thread, so we can do the non-sleeping lock here. + */ + if (lock_flags) { + if (!xfs_ilock_nowait(ip, lock_flags)) + BUG(); + } + + /* + * These values must be set before inserting the inode into the radix + * tree as the moment it is inserted a concurrent lookup (allowed by the + * RCU locking mechanism) can find it and that lookup must see that this + * is an inode currently under construction (i.e. that XFS_INEW is set). + * The ip->i_flags_lock that protects the XFS_INEW flag forms the + * memory barrier that ensures this detection works correctly at lookup + * time. + */ + iflags = XFS_INEW; + if (flags & XFS_IGET_DONTCACHE) + iflags |= XFS_IDONTCACHE; + ip->i_udquot = ip->i_gdquot = NULL; + xfs_iflags_set(ip, iflags); + + /* insert the new inode */ + spin_lock(&pag->pag_ici_lock); + error = radix_tree_insert(&pag->pag_ici_root, agino, ip); + if (unlikely(error)) { + WARN_ON(error != -EEXIST); + XFS_STATS_INC(xs_ig_dup); + error = EAGAIN; + goto out_preload_end; + } + spin_unlock(&pag->pag_ici_lock); + radix_tree_preload_end(); + + *ipp = ip; + return 0; + +out_preload_end: + spin_unlock(&pag->pag_ici_lock); + radix_tree_preload_end(); + if (lock_flags) + xfs_iunlock(ip, lock_flags); +out_destroy: + __destroy_inode(VFS_I(ip)); + xfs_inode_free(ip); + return error; +} + +/* + * Look up an inode by number in the given file system. + * The inode is looked up in the cache held in each AG. + * If the inode is found in the cache, initialise the vfs inode + * if necessary. + * + * If it is not in core, read it in from the file system's device, + * add it to the cache and initialise the vfs inode. + * + * The inode is locked according to the value of the lock_flags parameter. + * This flag parameter indicates how and if the inode's IO lock and inode lock + * should be taken. + * + * mp -- the mount point structure for the current file system. It points + * to the inode hash table. + * tp -- a pointer to the current transaction if there is one. This is + * simply passed through to the xfs_iread() call. + * ino -- the number of the inode desired. This is the unique identifier + * within the file system for the inode being requested. + * lock_flags -- flags indicating how to lock the inode. 
See the comment + * for xfs_ilock() for a list of valid values. + */ +int +xfs_iget( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_ino_t ino, + uint flags, + uint lock_flags, + xfs_inode_t **ipp) +{ + xfs_inode_t *ip; + int error; + xfs_perag_t *pag; + xfs_agino_t agino; + + /* + * xfs_reclaim_inode() uses the ILOCK to ensure an inode + * doesn't get freed while it's being referenced during a + * radix tree traversal here. It assumes this function + * aqcuires only the ILOCK (and therefore it has no need to + * involve the IOLOCK in this synchronization). + */ + ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); + + /* reject inode numbers outside existing AGs */ + if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) + return EINVAL; + + /* get the perag structure and ensure that it's inode capable */ + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); + agino = XFS_INO_TO_AGINO(mp, ino); + +again: + error = 0; + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, agino); + + if (ip) { + error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); + if (error) + goto out_error_or_again; + } else { + rcu_read_unlock(); + XFS_STATS_INC(xs_ig_missed); + + error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, + flags, lock_flags); + if (error) + goto out_error_or_again; + } + xfs_perag_put(pag); + + *ipp = ip; + + /* + * If we have a real type for an on-disk inode, we can set ops(&unlock) + * now. If it's a new inode being created, xfs_ialloc will handle it. + */ + if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) + xfs_setup_inode(ip); + return 0; + +out_error_or_again: + if (error == EAGAIN) { + delay(1); + goto again; + } + xfs_perag_put(pag); + return error; +} + /* * The inode lookup is done in batches to keep the amount of lock traffic and * radix tree lookups to a minimum. The batch size is a trade off between @@ -253,7 +668,7 @@ xfs_reclaim_worker( xfs_reclaim_work_queue(mp); } -void +static void __xfs_inode_set_reclaim_tag( struct xfs_perag *pag, struct xfs_inode *ip) @@ -319,7 +734,7 @@ __xfs_inode_clear_reclaim( } } -void +STATIC void __xfs_inode_clear_reclaim_tag( xfs_mount_t *mp, xfs_perag_t *pag, @@ -542,7 +957,7 @@ out: * then a shut down during filesystem unmount reclaim walk leak all the * unreclaimed inodes. 
*/ -int +STATIC int xfs_reclaim_inodes_ag( struct xfs_mount *mp, int flags, diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 0ba9c89c316e..222e22f16b4a 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -24,6 +24,9 @@ struct xfs_perag; #define SYNC_WAIT 0x0001 /* wait for i/o to complete */ #define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ +int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, + uint flags, uint lock_flags, xfs_inode_t **ipp); + void xfs_reclaim_worker(struct work_struct *work); int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); @@ -31,9 +34,6 @@ int xfs_reclaim_inodes_count(struct xfs_mount *mp); void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); -void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); -void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, - struct xfs_inode *ip); int xfs_sync_inode_grab(struct xfs_inode *ip); int xfs_inode_ag_iterator(struct xfs_mount *mp, diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c deleted file mode 100644 index ea9a5fa49a48..000000000000 --- a/fs/xfs/xfs_iget.c +++ /dev/null @@ -1,455 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_acl.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" -#include "xfs_quota.h" -#include "xfs_utils.h" -#include "xfs_trans_priv.h" -#include "xfs_inode_item.h" -#include "xfs_bmap.h" -#include "xfs_trace.h" -#include "xfs_icache.h" - - -/* - * Allocate and initialise an xfs_inode. - */ -STATIC struct xfs_inode * -xfs_inode_alloc( - struct xfs_mount *mp, - xfs_ino_t ino) -{ - struct xfs_inode *ip; - - /* - * if this didn't occur in transactions, we could use - * KM_MAYFAIL and return NULL here on ENOMEM. Set the - * code up to do this anyway. 
- */ - ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); - if (!ip) - return NULL; - if (inode_init_always(mp->m_super, VFS_I(ip))) { - kmem_zone_free(xfs_inode_zone, ip); - return NULL; - } - - ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!spin_is_locked(&ip->i_flags_lock)); - ASSERT(!xfs_isiflocked(ip)); - ASSERT(ip->i_ino == 0); - - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); - - /* initialise the xfs inode */ - ip->i_ino = ino; - ip->i_mount = mp; - memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); - ip->i_afp = NULL; - memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); - ip->i_flags = 0; - ip->i_delayed_blks = 0; - memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); - - return ip; -} - -STATIC void -xfs_inode_free_callback( - struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - struct xfs_inode *ip = XFS_I(inode); - - kmem_zone_free(xfs_inode_zone, ip); -} - -void -xfs_inode_free( - struct xfs_inode *ip) -{ - switch (ip->i_d.di_mode & S_IFMT) { - case S_IFREG: - case S_IFDIR: - case S_IFLNK: - xfs_idestroy_fork(ip, XFS_DATA_FORK); - break; - } - - if (ip->i_afp) - xfs_idestroy_fork(ip, XFS_ATTR_FORK); - - if (ip->i_itemp) { - ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL)); - xfs_inode_item_destroy(ip); - ip->i_itemp = NULL; - } - - /* asserts to verify all state is correct here */ - ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!spin_is_locked(&ip->i_flags_lock)); - ASSERT(!xfs_isiflocked(ip)); - - /* - * Because we use RCU freeing we need to ensure the inode always - * appears to be reclaimed with an invalid inode number when in the - * free state. The ip->i_flags_lock provides the barrier against lookup - * races. - */ - spin_lock(&ip->i_flags_lock); - ip->i_flags = XFS_IRECLAIM; - ip->i_ino = 0; - spin_unlock(&ip->i_flags_lock); - - call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); -} - -/* - * Check the validity of the inode we just found it the cache - */ -static int -xfs_iget_cache_hit( - struct xfs_perag *pag, - struct xfs_inode *ip, - xfs_ino_t ino, - int flags, - int lock_flags) __releases(RCU) -{ - struct inode *inode = VFS_I(ip); - struct xfs_mount *mp = ip->i_mount; - int error; - - /* - * check for re-use of an inode within an RCU grace period due to the - * radix tree nodes not being updated yet. We monitor for this by - * setting the inode number to zero before freeing the inode structure. - * If the inode has been reallocated and set up, then the inode number - * will not match, so check for that, too. - */ - spin_lock(&ip->i_flags_lock); - if (ip->i_ino != ino) { - trace_xfs_iget_skip(ip); - XFS_STATS_INC(xs_ig_frecycle); - error = EAGAIN; - goto out_error; - } - - - /* - * If we are racing with another cache hit that is currently - * instantiating this inode or currently recycling it out of - * reclaimabe state, wait for the initialisation to complete - * before continuing. - * - * XXX(hch): eventually we should do something equivalent to - * wait_on_inode to wait for these flags to be cleared - * instead of polling for it. - */ - if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { - trace_xfs_iget_skip(ip); - XFS_STATS_INC(xs_ig_frecycle); - error = EAGAIN; - goto out_error; - } - - /* - * If lookup is racing with unlink return an error immediately. - */ - if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { - error = ENOENT; - goto out_error; - } - - /* - * If IRECLAIMABLE is set, we've torn down the VFS inode already. - * Need to carefully get it back into useable state. 
- */ - if (ip->i_flags & XFS_IRECLAIMABLE) { - trace_xfs_iget_reclaim(ip); - - /* - * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode - * from stomping over us while we recycle the inode. We can't - * clear the radix tree reclaimable tag yet as it requires - * pag_ici_lock to be held exclusive. - */ - ip->i_flags |= XFS_IRECLAIM; - - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - - error = -inode_init_always(mp->m_super, inode); - if (error) { - /* - * Re-initializing the inode failed, and we are in deep - * trouble. Try to re-add it to the reclaim list. - */ - rcu_read_lock(); - spin_lock(&ip->i_flags_lock); - - ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); - ASSERT(ip->i_flags & XFS_IRECLAIMABLE); - trace_xfs_iget_reclaim_fail(ip); - goto out_error; - } - - spin_lock(&pag->pag_ici_lock); - spin_lock(&ip->i_flags_lock); - - /* - * Clear the per-lifetime state in the inode as we are now - * effectively a new inode and need to return to the initial - * state before reuse occurs. - */ - ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; - ip->i_flags |= XFS_INEW; - __xfs_inode_clear_reclaim_tag(mp, pag, ip); - inode->i_state = I_NEW; - - ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); - - spin_unlock(&ip->i_flags_lock); - spin_unlock(&pag->pag_ici_lock); - } else { - /* If the VFS inode is being torn down, pause and try again. */ - if (!igrab(inode)) { - trace_xfs_iget_skip(ip); - error = EAGAIN; - goto out_error; - } - - /* We've got a live one. */ - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - trace_xfs_iget_hit(ip); - } - - if (lock_flags != 0) - xfs_ilock(ip, lock_flags); - - xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); - XFS_STATS_INC(xs_ig_found); - - return 0; - -out_error: - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - return error; -} - - -static int -xfs_iget_cache_miss( - struct xfs_mount *mp, - struct xfs_perag *pag, - xfs_trans_t *tp, - xfs_ino_t ino, - struct xfs_inode **ipp, - int flags, - int lock_flags) -{ - struct xfs_inode *ip; - int error; - xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); - int iflags; - - ip = xfs_inode_alloc(mp, ino); - if (!ip) - return ENOMEM; - - error = xfs_iread(mp, tp, ip, flags); - if (error) - goto out_destroy; - - trace_xfs_iget_miss(ip); - - if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { - error = ENOENT; - goto out_destroy; - } - - /* - * Preload the radix tree so we can insert safely under the - * write spinlock. Note that we cannot sleep inside the preload - * region. Since we can be called from transaction context, don't - * recurse into the file system. - */ - if (radix_tree_preload(GFP_NOFS)) { - error = EAGAIN; - goto out_destroy; - } - - /* - * Because the inode hasn't been added to the radix-tree yet it can't - * be found by another thread, so we can do the non-sleeping lock here. - */ - if (lock_flags) { - if (!xfs_ilock_nowait(ip, lock_flags)) - BUG(); - } - - /* - * These values must be set before inserting the inode into the radix - * tree as the moment it is inserted a concurrent lookup (allowed by the - * RCU locking mechanism) can find it and that lookup must see that this - * is an inode currently under construction (i.e. that XFS_INEW is set). - * The ip->i_flags_lock that protects the XFS_INEW flag forms the - * memory barrier that ensures this detection works correctly at lookup - * time. 
- */ - iflags = XFS_INEW; - if (flags & XFS_IGET_DONTCACHE) - iflags |= XFS_IDONTCACHE; - ip->i_udquot = ip->i_gdquot = NULL; - xfs_iflags_set(ip, iflags); - - /* insert the new inode */ - spin_lock(&pag->pag_ici_lock); - error = radix_tree_insert(&pag->pag_ici_root, agino, ip); - if (unlikely(error)) { - WARN_ON(error != -EEXIST); - XFS_STATS_INC(xs_ig_dup); - error = EAGAIN; - goto out_preload_end; - } - spin_unlock(&pag->pag_ici_lock); - radix_tree_preload_end(); - - *ipp = ip; - return 0; - -out_preload_end: - spin_unlock(&pag->pag_ici_lock); - radix_tree_preload_end(); - if (lock_flags) - xfs_iunlock(ip, lock_flags); -out_destroy: - __destroy_inode(VFS_I(ip)); - xfs_inode_free(ip); - return error; -} - -/* - * Look up an inode by number in the given file system. - * The inode is looked up in the cache held in each AG. - * If the inode is found in the cache, initialise the vfs inode - * if necessary. - * - * If it is not in core, read it in from the file system's device, - * add it to the cache and initialise the vfs inode. - * - * The inode is locked according to the value of the lock_flags parameter. - * This flag parameter indicates how and if the inode's IO lock and inode lock - * should be taken. - * - * mp -- the mount point structure for the current file system. It points - * to the inode hash table. - * tp -- a pointer to the current transaction if there is one. This is - * simply passed through to the xfs_iread() call. - * ino -- the number of the inode desired. This is the unique identifier - * within the file system for the inode being requested. - * lock_flags -- flags indicating how to lock the inode. See the comment - * for xfs_ilock() for a list of valid values. - */ -int -xfs_iget( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_ino_t ino, - uint flags, - uint lock_flags, - xfs_inode_t **ipp) -{ - xfs_inode_t *ip; - int error; - xfs_perag_t *pag; - xfs_agino_t agino; - - /* - * xfs_reclaim_inode() uses the ILOCK to ensure an inode - * doesn't get freed while it's being referenced during a - * radix tree traversal here. It assumes this function - * aqcuires only the ILOCK (and therefore it has no need to - * involve the IOLOCK in this synchronization). - */ - ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); - - /* reject inode numbers outside existing AGs */ - if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) - return EINVAL; - - /* get the perag structure and ensure that it's inode capable */ - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); - agino = XFS_INO_TO_AGINO(mp, ino); - -again: - error = 0; - rcu_read_lock(); - ip = radix_tree_lookup(&pag->pag_ici_root, agino); - - if (ip) { - error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); - if (error) - goto out_error_or_again; - } else { - rcu_read_unlock(); - XFS_STATS_INC(xs_ig_missed); - - error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, - flags, lock_flags); - if (error) - goto out_error_or_again; - } - xfs_perag_put(pag); - - *ipp = ip; - - /* - * If we have a real type for an on-disk inode, we can set ops(&unlock) - * now. If it's a new inode being created, xfs_ialloc will handle it. 
- */
- 	if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
- 		xfs_setup_inode(ip);
- 	return 0;
-
-out_error_or_again:
- 	if (error == EAGAIN) {
- 		delay(1);
- 		goto again;
- 	}
- 	xfs_perag_put(pag);
- 	return error;
-}
-
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ba404e4b9f0c..bba8f37525b3 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -45,6 +45,7 @@ #include "xfs_filestream.h" #include "xfs_vnodeops.h" #include "xfs_trace.h" +#include "xfs_icache.h" kmem_zone_t *xfs_ifork_zone; kmem_zone_t *xfs_inode_zone; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 94b32f906e79..1fc2065e010b 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -496,11 +496,10 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \ ((pip)->i_d.di_mode & S_ISGID)) + /* - * xfs_iget.c prototypes. + * xfs_inode.c prototypes. */ -int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, - uint, uint, xfs_inode_t **); void xfs_ilock(xfs_inode_t *, uint); int xfs_ilock_nowait(xfs_inode_t *, uint); void xfs_iunlock(xfs_inode_t *, uint); @@ -508,11 +507,6 @@ void xfs_ilock_demote(xfs_inode_t *, uint); int xfs_isilocked(xfs_inode_t *, uint); uint xfs_ilock_map_shared(xfs_inode_t *); void xfs_iunlock_map_shared(xfs_inode_t *, uint); -void xfs_inode_free(struct xfs_inode *ip); - -/* - * xfs_inode.c prototypes. - */ int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, struct xfs_buf **, xfs_inode_t **); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 01d10a66e302..3998fd2a7949 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -34,6 +34,7 @@ #include "xfs_error.h" #include "xfs_btree.h" #include "xfs_trace.h" +#include "xfs_icache.h" STATIC int xfs_internal_inum( diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 5da3ace352bf..651c98859b04 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -42,6 +42,7 @@ #include "xfs_quota.h" #include "xfs_utils.h" #include "xfs_trace.h" +#include "xfs_icache.h" STATIC int xlog_find_zeroed( diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 2e86fa0cfc0d..48c750b0e830 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -40,6 +40,7 @@ #include "xfs_utils.h" #include "xfs_qm.h" #include "xfs_trace.h" +#include "xfs_icache.h" /* * The global quota manager. There is only one of these for the entire diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index ca28a4ba4b54..a69e0b4750a9 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -38,6 +38,7 @@ #include "xfs_utils.h" #include "xfs_trace.h" #include "xfs_buf.h" +#include "xfs_icache.h" /* diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 14928564f106..2ee1f49da0aa 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -47,6 +47,7 @@ #include "xfs_filestream.h" #include "xfs_vnodeops.h" #include "xfs_trace.h" +#include "xfs_icache.h" /* * The maximum pathlen is 1024 bytes. Since the minimum file system -- cgit v1.2.1 From d35e88faa3b0fc2cea35c3b2dca358b5cd09b45f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:12 +1100 Subject: xfs: only update the last_sync_lsn when a transaction completes The log write code stamps each iclog with the current tail LSN in the iclog header so that recovery knows where to find the tail of the log once it has found the head.
Normally this is taken from the first item on the AIL - the log item that corresponds to the oldest active item in the log. The problem is that when the AIL is empty, the tail lsn is derived from the l_last_sync_lsn, which is the LSN of the last iclog to be written to the log. In most cases this doesn't happen, because the AIL is rarely empty on an active filesystem. However, when it does, it opens up an interesting case when the transaction being committed to the iclog spans multiple iclogs. That is, the first iclog is stamped with the l_last_sync_lsn, and IO is issued. Then the next iclog is set up, the changes copied into the iclog (takes some time), and then the l_last_sync_lsn is stamped into the header and IO is issued. This is still the same transaction, so the tail lsn of both iclogs must be the same for log recovery to find the entire transaction to be able to replay it. The problem arises in that the iclog buffer IO completion updates the l_last_sync_lsn with its own LSN. Therefore, if the first iclog completes its IO before the second iclog is filled and has the tail lsn stamped in it, it will stamp the LSN of the first iclog into its tail lsn field. If the system fails at this point, log recovery will not see a complete transaction, so the transaction will not be replayed. The fix is simple - the l_last_sync_lsn is updated when an iclog buffer IO completes, and this is incorrect. The l_last_sync_lsn should be updated when a transaction is completed by an iclog buffer IO. That is, only iclog buffers that have transaction commit callbacks attached to them should update the l_last_sync_lsn. This means that the last_sync_lsn will only move forward when a commit record is written, not in the middle of a large transaction that is rolling through multiple iclog buffers. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index d2d59692739f..46b6986e39b0 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2461,14 +2461,27 @@ xlog_state_do_callback( /* - * update the last_sync_lsn before we drop the + * Completion of an iclog IO does not imply that + * a transaction has completed, as transactions + * can be large enough to span many iclogs. We + * cannot change the tail of the log half way + * through a transaction as this may be the only + * transaction in the log and moving the tail to + * point to the middle of it will prevent + * recovery from finding the start of the + * transaction. Hence we should only update the + * last_sync_lsn if this iclog contains + * transaction completion callbacks on it. + * + * We have to do this before we drop the + * icloglock to ensure we are the only one that + * can update it. */ ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); - atomic64_set(&log->l_last_sync_lsn, - be64_to_cpu(iclog->ic_header.h_lsn)); + if (iclog->ic_callback) + atomic64_set(&log->l_last_sync_lsn, + be64_to_cpu(iclog->ic_header.h_lsn)); } else ioerrors++; -- cgit v1.2.1 From a00416844b8f4b0106344bdfd90fe45a854b1d05 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Thu, 20 Sep 2012 13:16:45 -0500 Subject: xfs: zero allocation_args on the kernel stack Zero the kernel stack space that makes up the xfs_alloc_arg structures.
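This is the standard idiom for on-stack argument structures: zero the whole struct before filling in the fields a given path cares about, so untouched fields read as 0/NULL rather than stack garbage. A self-contained sketch of the idiom - struct alloc_args is an illustrative stand-in, not the real xfs_alloc_arg_t layout:

    #include <string.h>

    struct alloc_args {
            void            *tp;        /* transaction, may stay NULL */
            void            *mp;        /* mount point */
            unsigned long   fsbno;      /* requested block */
            int             isfl;       /* a flag some callers never set */
    };

    static void prepare_args(struct alloc_args *args, void *tp, void *mp)
    {
            memset(args, 0, sizeof(*args)); /* no stale stack data survives */
            args->tp = tp;
            args->mp = mp;
            /* fields not assigned above are now reliably zero */
    }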
Signed-off-by: Mark Tinguely Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 1 + fs/xfs/xfs_bmap.c | 3 +++ fs/xfs/xfs_ialloc.c | 1 + 3 files changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 4f33c32affe3..0287f3b1b503 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -1866,6 +1866,7 @@ xfs_alloc_fix_freelist( /* * Initialize the args structure. */ + memset(&targs, 0, sizeof(targs)); targs.tp = tp; targs.mp = mp; targs.agbp = agbp; diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 848ffa77707b..e1545ec2f7d2 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2437,6 +2437,7 @@ xfs_bmap_btalloc( * Normal allocation, done through xfs_alloc_vextent. */ tryagain = isaligned = 0; + memset(&args, 0, sizeof(args)); args.tp = ap->tp; args.mp = mp; args.fsbno = ap->blkno; @@ -3082,6 +3083,7 @@ xfs_bmap_extents_to_btree( * Convert to a btree with two levels, one record in root. */ XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE); + memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = mp; args.firstblock = *firstblock; @@ -3237,6 +3239,7 @@ xfs_bmap_local_to_extents( xfs_buf_t *bp; /* buffer for extent block */ xfs_bmbt_rec_host_t *ep;/* extent record pointer */ + memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = ip->i_mount; args.firstblock = *firstblock; diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 445bf1aef31c..c5c4ef4f2bdb 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -250,6 +250,7 @@ xfs_ialloc_ag_alloc( /* boundary */ struct xfs_perag *pag; + memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = tp->t_mountp; -- cgit v1.2.1 From 2455881c0b52f87be539c4c7deab1afff4d8a560 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 5 Oct 2012 11:06:58 +1000 Subject: xfs: introduce XFS_BMAPI_STACK_SWITCH Certain allocation paths through xfs_bmapi_write() are in situations where we have limited stack available. These are almost always in the buffered IO writeback path when converting delayed allocation extents to real extents. The current stack switch occurs for userdata allocations, which means we also do stack switches for preallocation, direct IO and unwritten extent conversion, even though these call chains have never been implicated in a stack overrun. Hence, let's target just the single stack overrun offender for stack switches. To do that, introduce a XFS_BMAPI_STACK_SWITCH flag that the caller can pass to xfs_bmapi_write() to indicate it should switch stacks if it needs to do allocation.
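The stack switch this flag requests works by bouncing the allocation into a workqueue worker - which runs on a fresh worker-thread stack - and waiting on a completion, as the diff below shows. A reduced sketch of that handoff; the struct and function names here are invented, but the primitives (work_struct, completion, container_of) are the real kernel ones:

    #include <linux/kernel.h>
    #include <linux/workqueue.h>
    #include <linux/completion.h>

    struct alloc_req {
            struct work_struct      work;
            struct completion       *done;
            int                     result;
            int                     stack_switch;   /* set from caller's flag */
    };

    static int do_alloc(struct alloc_req *req)
    {
            /* the real allocation work would happen here */
            return 0;
    }

    static void alloc_worker(struct work_struct *work)
    {
            struct alloc_req *req = container_of(work, struct alloc_req, work);

            req->result = do_alloc(req);    /* runs on the worker's stack */
            complete(req->done);
    }

    static int alloc_maybe_switch_stack(struct alloc_req *req)
    {
            DECLARE_COMPLETION_ONSTACK(done);

            if (!req->stack_switch)
                    return do_alloc(req);   /* plenty of stack: call direct */

            req->done = &done;
            INIT_WORK_ONSTACK(&req->work, alloc_worker);
            queue_work(system_wq, &req->work);
            wait_for_completion(&done);     /* block until the worker is done */
            return req->result;
    }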
Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 2 +- fs/xfs/xfs_alloc.h | 1 + fs/xfs/xfs_bmap.c | 4 ++++ fs/xfs/xfs_bmap.h | 5 ++++- fs/xfs/xfs_iomap.c | 4 +++- 5 files changed, 13 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 0287f3b1b503..43f791bcd8b1 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2447,7 +2447,7 @@ xfs_alloc_vextent( { DECLARE_COMPLETION_ONSTACK(done); - if (!args->userdata) + if (!args->stack_switch) return __xfs_alloc_vextent(args); diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 93be4a667ca1..ef7d4885dc2d 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -123,6 +123,7 @@ typedef struct xfs_alloc_arg { struct completion *done; struct work_struct work; int result; + char stack_switch; } xfs_alloc_arg_t; /* diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index e1545ec2f7d2..91259554df8b 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2441,6 +2441,7 @@ xfs_bmap_btalloc( args.tp = ap->tp; args.mp = mp; args.fsbno = ap->blkno; + args.stack_switch = ap->stack_switch; /* Trim the allocation back to the maximum an AG can fit. */ args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp)); @@ -4675,6 +4676,9 @@ xfs_bmapi_allocate( return error; } + if (flags & XFS_BMAPI_STACK_SWITCH) + bma->stack_switch = 1; + error = xfs_bmap_alloc(bma); if (error) return error; diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 803b56d7ce16..b68c598034c1 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -77,6 +77,7 @@ typedef struct xfs_bmap_free * from written to unwritten, otherwise convert from unwritten to written. */ #define XFS_BMAPI_CONVERT 0x040 +#define XFS_BMAPI_STACK_SWITCH 0x080 #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ @@ -85,7 +86,8 @@ typedef struct xfs_bmap_free { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ { XFS_BMAPI_CONTIG, "CONTIG" }, \ - { XFS_BMAPI_CONVERT, "CONVERT" } + { XFS_BMAPI_CONVERT, "CONVERT" }, \ + { XFS_BMAPI_STACK_SWITCH, "STACK_SWITCH" } static inline int xfs_bmapi_aflag(int w) @@ -133,6 +135,7 @@ typedef struct xfs_bmalloca { char userdata;/* set if is user data */ char aeof; /* allocated space at eof */ char conv; /* overwriting unwritten extents */ + char stack_switch; } xfs_bmalloca_t; /* diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index f858b903678e..a066cf1766ab 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -575,7 +575,9 @@ xfs_iomap_write_allocate( * pointer that the caller gave to us. */ error = xfs_bmapi_write(tp, ip, map_start_fsb, - count_fsb, 0, &first_block, 1, + count_fsb, + XFS_BMAPI_STACK_SWITCH, + &first_block, 1, imap, &nimaps, &free_list); if (error) goto trans_cancel; -- cgit v1.2.1 From e04426b9202bccd4cfcbc70b2fa2aeca1c86d8f5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 5 Oct 2012 11:06:59 +1000 Subject: xfs: move allocation stack switch up to xfs_bmapi_allocate Switching stacks at xfs_alloc_vextent can cause deadlocks when we run out of worker threads on the allocation workqueue. This can occur because xfs_bmap_btalloc can make multiple calls to xfs_alloc_vextent() and even if xfs_alloc_vextent() fails it can return with the AGF locked in the current allocation transaction.
If we then need to make another allocation, and all the allocation worker contexts are exhausted because they are blocked waiting for the AGF lock, the holder of the AGF cannot get its xfs_alloc_vextent work completed to release the AGF. Hence allocation effectively deadlocks. To avoid this, move the stack switch one layer up to xfs_bmapi_allocate() so that all of the allocation attempts in a single switched stack transaction occur in a single worker context. This avoids the problem of an allocation being blocked waiting for a worker thread whilst holding the AGF. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 42 +------------------------------------- fs/xfs/xfs_alloc.h | 4 ---- fs/xfs/xfs_bmap.c | 60 ++++++++++++++++++++++++++++++++++++++++++++---------- fs/xfs/xfs_bmap.h | 4 ++++ 4 files changed, 54 insertions(+), 56 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 43f791bcd8b1..335206a9c698 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2208,7 +2208,7 @@ xfs_alloc_read_agf( * group or loop over the allocation groups to find the result. */ int /* error */ -__xfs_alloc_vextent( +xfs_alloc_vextent( xfs_alloc_arg_t *args) /* allocation argument structure */ { xfs_agblock_t agsize; /* allocation group size */ @@ -2418,46 +2418,6 @@ error0: return error; } -static void -xfs_alloc_vextent_worker( - struct work_struct *work) -{ - struct xfs_alloc_arg *args = container_of(work, - struct xfs_alloc_arg, work); - unsigned long pflags; - - /* we are in a transaction context here */ - current_set_flags_nested(&pflags, PF_FSTRANS); - - args->result = __xfs_alloc_vextent(args); - complete(args->done); - - current_restore_flags_nested(&pflags, PF_FSTRANS); -} - -/* - * Data allocation requests often come in with little stack to work on. Push - * them off to a worker thread so there is lots of stack to use. Metadata - * requests, OTOH, are generally from low stack usage paths, so avoid the - * context switch overhead here. - */ -int -xfs_alloc_vextent( - struct xfs_alloc_arg *args) -{ - DECLARE_COMPLETION_ONSTACK(done); - - if (!args->stack_switch) - return __xfs_alloc_vextent(args); - - - args->done = &done; - INIT_WORK_ONSTACK(&args->work, xfs_alloc_vextent_worker); - queue_work(xfs_alloc_wq, &args->work); - wait_for_completion(&done); - return args->result; -} - /* * Free an extent. * Just break up the extent address and hand off to xfs_free_ag_extent diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index ef7d4885dc2d..feacb061bab7 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -120,10 +120,6 @@ typedef struct xfs_alloc_arg { char isfl; /* set if is freelist blocks - !acctg */ char userdata; /* set if this is user data */ xfs_fsblock_t firstblock; /* io first block allocated */ - struct completion *done; - struct work_struct work; - int result; - char stack_switch; } xfs_alloc_arg_t; /* diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 91259554df8b..83d0cf3df930 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2441,7 +2441,6 @@ xfs_bmap_btalloc( args.tp = ap->tp; args.mp = mp; args.fsbno = ap->blkno; - args.stack_switch = ap->stack_switch; /* Trim the allocation back to the maximum an AG can fit.
*/ args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp)); @@ -4620,12 +4619,11 @@ xfs_bmapi_delay( STATIC int -xfs_bmapi_allocate( - struct xfs_bmalloca *bma, - int flags) +__xfs_bmapi_allocate( + struct xfs_bmalloca *bma) { struct xfs_mount *mp = bma->ip->i_mount; - int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + int whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ? XFS_ATTR_FORK : XFS_DATA_FORK; struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); int tmp_logflags = 0; @@ -4658,25 +4656,25 @@ xfs_bmapi_allocate( * Indicate if this is the first user data in the file, or just any * user data. */ - if (!(flags & XFS_BMAPI_METADATA)) { + if (!(bma->flags & XFS_BMAPI_METADATA)) { bma->userdata = (bma->offset == 0) ? XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; } - bma->minlen = (flags & XFS_BMAPI_CONTIG) ? bma->length : 1; + bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1; /* * Only want to do the alignment at the eof if it is userdata and * allocation length is larger than a stripe unit. */ if (mp->m_dalign && bma->length >= mp->m_dalign && - !(flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { + !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { error = xfs_bmap_isaeof(bma, whichfork); if (error) return error; } - if (flags & XFS_BMAPI_STACK_SWITCH) + if (bma->flags & XFS_BMAPI_STACK_SWITCH) bma->stack_switch = 1; error = xfs_bmap_alloc(bma); @@ -4713,7 +4711,7 @@ xfs_bmapi_allocate( * A wasdelay extent has been initialized, so shouldn't be flagged * as unwritten. */ - if (!bma->wasdel && (flags & XFS_BMAPI_PREALLOC) && + if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) && xfs_sb_version_hasextflgbit(&mp->m_sb)) bma->got.br_state = XFS_EXT_UNWRITTEN; @@ -4741,6 +4739,45 @@ xfs_bmapi_allocate( return 0; } +static void +xfs_bmapi_allocate_worker( + struct work_struct *work) +{ + struct xfs_bmalloca *args = container_of(work, + struct xfs_bmalloca, work); + unsigned long pflags; + + /* we are in a transaction context here */ + current_set_flags_nested(&pflags, PF_FSTRANS); + + args->result = __xfs_bmapi_allocate(args); + complete(args->done); + + current_restore_flags_nested(&pflags, PF_FSTRANS); +} + +/* + * Some allocation requests often come in with little stack to work on. Push + * them off to a worker thread so there is lots of stack to use. Otherwise just + * call directly to avoid the context switch overhead here. 
+ */ +int +xfs_bmapi_allocate( + struct xfs_bmalloca *args) +{ + DECLARE_COMPLETION_ONSTACK(done); + + if (!args->stack_switch) + return __xfs_bmapi_allocate(args); + + + args->done = &done; + INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker); + queue_work(xfs_alloc_wq, &args->work); + wait_for_completion(&done); + return args->result; +} + STATIC int xfs_bmapi_convert_unwritten( struct xfs_bmalloca *bma, @@ -4926,6 +4963,7 @@ xfs_bmapi_write( bma.conv = !!(flags & XFS_BMAPI_CONVERT); bma.wasdel = wasdelay; bma.offset = bno; + bma.flags = flags; /* * There's a 32/64 bit type mismatch between the @@ -4941,7 +4979,7 @@ xfs_bmapi_write( ASSERT(len > 0); ASSERT(bma.length > 0); - error = xfs_bmapi_allocate(&bma, flags); + error = xfs_bmapi_allocate(&bma); if (error) goto error0; if (bma.blkno == NULLFSBLOCK) diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index b68c598034c1..5f469c3516eb 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -136,6 +136,10 @@ typedef struct xfs_bmalloca { char aeof; /* allocated space at eof */ char conv; /* overwriting unwritten extents */ char stack_switch; + int flags; + struct completion *done; + struct work_struct work; + int result; } xfs_bmalloca_t; /* -- cgit v1.2.1 From 386bc35a2d548c28a5083b2e162a20251b37cab5 Mon Sep 17 00:00:00 2001 From: Anna Leuschner Date: Mon, 22 Oct 2012 21:53:36 +0200 Subject: vfs: fix: don't increase bio_slab_max if krealloc() fails Without the patch, bio_slab_max, representing bio_slabs capacity, is increased before krealloc() of bio_slabs. If krealloc() fails, bio_slab_max is too high. Fix that by only updating bio_slab_max if krealloc() is successful. Signed-off-by: Anna Leuschner Signed-off-by: Jens Axboe --- fs/bio.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 9298c65ad9c7..b96fc6ce4855 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -75,6 +75,7 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) unsigned int sz = sizeof(struct bio) + extra_size; struct kmem_cache *slab = NULL; struct bio_slab *bslab, *new_bio_slabs; + unsigned int new_bio_slab_max; unsigned int i, entry = -1; mutex_lock(&bio_slab_lock); @@ -97,12 +98,13 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) goto out_unlock; if (bio_slab_nr == bio_slab_max && entry == -1) { - bio_slab_max <<= 1; + new_bio_slab_max = bio_slab_max << 1; new_bio_slabs = krealloc(bio_slabs, - bio_slab_max * sizeof(struct bio_slab), + new_bio_slab_max * sizeof(struct bio_slab), GFP_KERNEL); if (!new_bio_slabs) goto out_unlock; + bio_slab_max = new_bio_slab_max; bio_slabs = new_bio_slabs; } if (entry == -1) -- cgit v1.2.1 From 8fcbaa2b7f5b70dba9ed1c7f91d0a270ce752e2c Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 18 Oct 2012 22:26:27 +0200 Subject: TTY: devpts, don't care about TTY in devpts_get_tty The goal is to stop setting and using tty->driver_data in devpts code. It should be used solely by the driver's code, pty in this case. First, here we remove TTY from devpts_get_tty and rename it to devpts_get_priv. Note we do not remove type safety, we just shift the [implicit] (void *) cast one layer up. index was unused in devpts_get_tty, so remove that from the prototype too. 
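The layering being restored here is a common kernel pattern: the filesystem stores an opaque pointer and hands it back as void *, and the cast to the concrete type happens only in the driver that owns the data. A minimal sketch of that split, with invented names standing in for the real devpts/pty types:

    struct pts_node { void *priv; };        /* stand-in for the pts inode */
    struct pty      { int index; };         /* stand-in for tty_struct */

    /* filesystem layer: stores and returns opaque data only */
    static void *pts_get_priv(struct pts_node *n)
    {
            return n->priv;                 /* no knowledge of the type */
    }

    /* driver layer: the single place that knows the real type */
    static struct pty *pty_from_node(struct pts_node *n)
    {
            return (struct pty *)pts_get_priv(n);
    }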
Signed-off-by: Jiri Slaby Acked-by: Alan Cox Signed-off-by: Greg Kroah-Hartman --- fs/devpts/inode.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 14afbabe6546..47965807884d 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -593,10 +593,10 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty) return ret; } -struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number) +void *devpts_get_priv(struct inode *pts_inode) { struct dentry *dentry; - struct tty_struct *tty; + void *priv = NULL; BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); @@ -605,13 +605,12 @@ struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number) if (!dentry) return NULL; - tty = NULL; if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) - tty = (struct tty_struct *)pts_inode->i_private; + priv = pts_inode->i_private; dput(dentry); - return tty; + return priv; } void devpts_pty_kill(struct tty_struct *tty) -- cgit v1.2.1 From 162b97cfa21f816f39ede1944f2a4220e3cf8969 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 18 Oct 2012 22:26:28 +0200 Subject: TTY: devpts, return created inode from devpts_pty_new The goal is to stop setting and using tty->driver_data in devpts code. It should be used solely by the driver's code, pty in this case. For the cleanup of layering, we will need the inode created in devpts_pty_new to be stored into slave's driver_data. So we convert devpts_pty_new to return the inode or an ERR_PTR-encoded error in case of failure. The move of 'inode = new_inode(sb);' from declarators to the code is only cosmetic, but it makes the code easier to read. Signed-off-by: Jiri Slaby Acked-by: Alan Cox Signed-off-by: Greg Kroah-Hartman --- fs/devpts/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 47965807884d..ec3bab716c05 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -545,7 +545,7 @@ void devpts_kill_index(struct inode *ptmx_inode, int idx) mutex_unlock(&allocated_ptys_lock); } -int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty) +struct inode *devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty) { /* tty layer puts index from devpts_new_index() in here */ int number = tty->index; @@ -553,19 +553,19 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty) dev_t device = MKDEV(driver->major, driver->minor_start+number); struct dentry *dentry; struct super_block *sb = pts_sb_from_inode(ptmx_inode); - struct inode *inode = new_inode(sb); + struct inode *inode; struct dentry *root = sb->s_root; struct pts_fs_info *fsi = DEVPTS_SB(sb); struct pts_mount_opts *opts = &fsi->mount_opts; - int ret = 0; char s[12]; /* We're supposed to be given the slave end of a pty */ BUG_ON(driver->type != TTY_DRIVER_TYPE_PTY); BUG_ON(driver->subtype != PTY_TYPE_SLAVE); + inode = new_inode(sb); if (!inode) - return -ENOMEM; + return ERR_PTR(-ENOMEM); inode->i_ino = number + 3; inode->i_uid = opts->setuid ?
opts->uid : current_fsuid(); @@ -585,12 +585,12 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty) fsnotify_create(root->d_inode, dentry); } else { iput(inode); - ret = -ENOMEM; + inode = ERR_PTR(-ENOMEM); } mutex_unlock(&root->d_inode->i_mutex); - return ret; + return inode; } void *devpts_get_priv(struct inode *pts_inode) -- cgit v1.2.1 From f11afb61247016162aa92225a337c1575556c9d9 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 18 Oct 2012 22:26:29 +0200 Subject: TTY: devpts, do not set driver_data The goal is to stop setting and using tty->driver_data in devpts code. It should be used solely by the driver's code, pty in this case. Now driver_data are managed only in the pty driver. devpts_pty_new is switched to accept what we used to dig out of tty_struct, i.e. device node number and index. This also removes a note about driver_data being set outside of the driver. Signed-off-by: Jiri Slaby Acked-by: Alan Cox Signed-off-by: Greg Kroah-Hartman --- fs/devpts/inode.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index ec3bab716c05..7a20d673bb8a 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -545,12 +545,9 @@ void devpts_kill_index(struct inode *ptmx_inode, int idx) mutex_unlock(&allocated_ptys_lock); } -struct inode *devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty) +struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, + void *priv) { - /* tty layer puts index from devpts_new_index() in here */ - int number = tty->index; - struct tty_driver *driver = tty->driver; - dev_t device = MKDEV(driver->major, driver->minor_start+number); struct dentry *dentry; struct super_block *sb = pts_sb_from_inode(ptmx_inode); struct inode *inode; @@ -559,23 +556,18 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty) struct pts_mount_opts *opts = &fsi->mount_opts; char s[12]; - /* We're supposed to be given the slave end of a pty */ - BUG_ON(driver->type != TTY_DRIVER_TYPE_PTY); - BUG_ON(driver->subtype != PTY_TYPE_SLAVE); - inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); - inode->i_ino = number + 3; + inode->i_ino = index + 3; inode->i_uid = opts->setuid ? opts->uid : current_fsuid(); inode->i_gid = opts->setgid ? opts->gid : current_fsgid(); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; init_special_inode(inode, S_IFCHR|opts->mode, device); - inode->i_private = tty; - tty->driver_data = inode; + inode->i_private = priv; - sprintf(s, "%d", number); + sprintf(s, "%d", index); mutex_lock(&root->d_inode->i_mutex); @@ -613,9 +605,8 @@ void *devpts_get_priv(struct inode *pts_inode) return priv; } -void devpts_pty_kill(struct tty_struct *tty) +void devpts_pty_kill(struct inode *inode) { - struct inode *inode = tty->driver_data; struct super_block *sb = pts_sb_from_inode(inode); struct dentry *root = sb->s_root; struct dentry *dentry; -- cgit v1.2.1 From 1dcb8e6d1c23f2e021639199fdf64d5b42689207 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 18 Oct 2012 22:26:30 +0200 Subject: TTY: devpts, document devpts inode operations Add kernel-doc texts for some devpts functions, i.e. document them. 
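Taken together, the three patches above leave devpts with the calling convention sketched below (illustrative sketch only; names taken from the diffs, error handling abbreviated):

	/* sketch: expected use of the reworked devpts API from a tty driver */
	struct inode *inode;

	inode = devpts_pty_new(ptmx_inode, device, index, priv);
	if (IS_ERR(inode))
		return PTR_ERR(inode);		/* e.g. -ENOMEM */

	priv = devpts_get_priv(pts_inode);	/* later: retrieve priv */

	devpts_pty_kill(inode);			/* inverse of devpts_pty_new */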
Signed-off-by: Jiri Slaby Acked-by: Alan Cox Signed-off-by: Greg Kroah-Hartman --- fs/devpts/inode.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'fs') diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 7a20d673bb8a..472e6befc54d 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -545,6 +545,15 @@ void devpts_kill_index(struct inode *ptmx_inode, int idx) mutex_unlock(&allocated_ptys_lock); } +/** + * devpts_pty_new -- create a new inode in /dev/pts/ + * @ptmx_inode: inode of the master + * @device: major+minor of the node to be created + * @index: used as a name of the node + * @priv: what's given back by devpts_get_priv + * + * The created inode is returned. Remove it from /dev/pts/ by devpts_pty_kill. + */ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, void *priv) { @@ -585,6 +594,12 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, return inode; } +/** + * devpts_get_priv -- get private data for a slave + * @pts_inode: inode of the slave + * + * Returns whatever was passed as priv in devpts_pty_new for a given inode. + */ void *devpts_get_priv(struct inode *pts_inode) { struct dentry *dentry; @@ -605,6 +620,12 @@ void *devpts_get_priv(struct inode *pts_inode) return priv; } +/** + * devpts_pty_kill -- remove inode from /dev/pts/ + * @inode: inode of the slave to be removed + * + * This is the inverse operation of devpts_pty_new. + */ void devpts_pty_kill(struct inode *inode) { struct super_block *sb = pts_sb_from_inode(inode); -- cgit v1.2.1 From c6298038bcfc20710430a4ad069bb1f3f069997c Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 24 Oct 2012 23:43:21 +0400 Subject: tty, ioctls -- Add new ioctl definitions for tty flags fetching This patch defines new ioctl codes TIOCGPKT, TIOCGPTLCK, TIOCGEXCL for fetching pty's packet mode and locking state, and exclusive mode of tty. [ No real handlers for the codes though, this will be addressed in another patch for easier review and bisectability ] Signed-off-by: Cyrill Gorcunov CC: Alan Cox CC: "H. Peter Anvin" CC: Pavel Emelyanov CC: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- fs/compat_ioctl.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index f5054025f9da..89cf6014a967 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -842,6 +842,9 @@ COMPATIBLE_IOCTL(TIOCGDEV) COMPATIBLE_IOCTL(TIOCCBRK) COMPATIBLE_IOCTL(TIOCGSID) COMPATIBLE_IOCTL(TIOCGICOUNT) +COMPATIBLE_IOCTL(TIOCGPKT) +COMPATIBLE_IOCTL(TIOCGPTLCK) +COMPATIBLE_IOCTL(TIOCGEXCL) /* Little t */ COMPATIBLE_IOCTL(TIOCGETD) COMPATIBLE_IOCTL(TIOCSETD) -- cgit v1.2.1 From 98a1eebda3cb2a84ecf1f219bb3a95769033d1bf Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 10 Oct 2012 10:55:28 +0300 Subject: UBIFS: introduce categorized lprops counter This commit is a preparation for a subsequent bugfix. We introduce a counter for categorized lprops.
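A sketch of how the new counter is meant to be consumed (the real consumer arrives in the follow-up fix below):

	/* sketch: c->in_a_category_cnt counts categorized lprops; while it
	 * is below c->main_lebs, some lprops were never read from the
	 * flash and are therefore still uncategorized */
	if (c->in_a_category_cnt != c->main_lebs) {
		/* the cached category lists are incomplete - scan the LPT */
	}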
Signed-off-by: Artem Bityutskiy Cc: stable@vger.kernel.org --- fs/ubifs/lprops.c | 6 ++++++ fs/ubifs/ubifs.h | 3 +++ 2 files changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index e5a2a35a46dc..46190a7c42a6 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -300,8 +300,11 @@ void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, default: ubifs_assert(0); } + lprops->flags &= ~LPROPS_CAT_MASK; lprops->flags |= cat; + c->in_a_category_cnt += 1; + ubifs_assert(c->in_a_category_cnt <= c->main_lebs); } /** @@ -334,6 +337,9 @@ static void ubifs_remove_from_cat(struct ubifs_info *c, default: ubifs_assert(0); } + + c->in_a_category_cnt -= 1; + ubifs_assert(c->in_a_category_cnt >= 0); } /** diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 5486346d0a3f..d133c276fe05 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1183,6 +1183,8 @@ struct ubifs_debug_info; * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size) * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size) * @freeable_cnt: number of freeable LEBs in @freeable_list + * @in_a_category_cnt: count of lprops which are in a certain category, which + * basically means that they were loaded from the flash * * @ltab_lnum: LEB number of LPT's own lprops table * @ltab_offs: offset of LPT's own lprops table @@ -1412,6 +1414,7 @@ struct ubifs_info { struct list_head freeable_list; struct list_head frdi_idx_list; int freeable_cnt; + int in_a_category_cnt; int ltab_lnum; int ltab_offs; -- cgit v1.2.1 From a28ad42a4a0c6f302f488f26488b8b37c9b30024 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Tue, 9 Oct 2012 16:20:15 +0300 Subject: UBIFS: fix mounting problems after power cuts This is a bugfix for a problem with the following symptoms: 1. A power cut happens 2. After reboot, we try to mount UBIFS 3. Mount fails with "No space left on device" error message UBIFS complains like this: UBIFS error (pid 28225): grab_empty_leb: could not find an empty LEB The root cause of this problem is that when we mount, not all LEBs are categorized. Only those which were read are. However, the 'ubifs_find_free_leb_for_idx()' function assumes that all LEBs were categorized and 'c->freeable_cnt' is valid, which is a false assumption. This patch fixes the problem by teaching 'ubifs_find_free_leb_for_idx()' to always fall back to LPT scanning if no freeable LEBs were found. This problem was reported by a few people in the past, but Brent Taylor was able to reproduce it and send me a flash image which cannot be mounted, which made it easy to hunt the bug. Kudos to Brent. Reported-by: Brent Taylor Signed-off-by: Artem Bityutskiy Cc: stable@vger.kernel.org --- fs/ubifs/find.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 28ec13af28d9..2dcf3d473fec 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -681,8 +681,16 @@ int ubifs_find_free_leb_for_idx(struct ubifs_info *c) if (!lprops) { lprops = ubifs_fast_find_freeable(c); if (!lprops) { - ubifs_assert(c->freeable_cnt == 0); - if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) { + /* + * The first condition means the following: go scan the + * LPT if there are uncategorized lprops, which means + * there may be freeable LEBs there (UBIFS does not + * store the information about freeable LEBs in the + * master node).
+ */ + if (c->in_a_category_cnt != c->main_lebs || + c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) { + ubifs_assert(c->freeable_cnt == 0); lprops = scan_for_leb_for_idx(c); if (IS_ERR(lprops)) { err = PTR_ERR(lprops); -- cgit v1.2.1 From 0f9831a89310cebba52d3f526e6cc5c2e403e6f1 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 18 Oct 2012 14:01:43 -0700 Subject: ceph: fix dentry reference leak in encode_fh() Call to d_find_alias() needs a corresponding dput() This fixes http://tracker.newdream.net/issues/3271 Signed-off-by: David Zafman Reviewed-by: Sage Weil --- fs/ceph/export.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 8e1b60e557b6..862887004d20 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -90,6 +90,8 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, *max_len = handle_length; type = 255; } + if (dentry) + dput(dentry); return type; } -- cgit v1.2.1 From b000056a5a8d3f5a4a9fb80184a7ec14f86a43d4 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 25 Oct 2012 10:23:46 -0700 Subject: ceph: Fix NULL ptr crash in strlen() set_request_path_attr() checks for NULL ptr before calling strlen() This fixes http://tracker.newdream.net/issues/3404 Signed-off-by: David Zafman Reviewed-by: Sage Weil --- fs/ceph/mds_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 1bcf712655d9..62d2342eb267 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1590,7 +1590,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, } else if (rpath || rino) { *ino = rino; *ppath = rpath; - *pathlen = strlen(rpath); + *pathlen = rpath ? strlen(rpath) : 0; dout(" path %.*s\n", *pathlen, rpath); } -- cgit v1.2.1 From 1a25b1c4ce189e3926f2981f3302352a930086db Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 15 Oct 2012 17:20:17 -0400 Subject: Lock splice_read and splice_write functions Functions generic_file_splice_read and generic_file_splice_write access the pagecache directly. For block devices these functions must be locked so that block size is not changed while they are in progress. This patch is an additional fix for commit b87570f5d349 ("Fix a crash when block device is read and block size is changed at the same time") that locked aio_read, aio_write and mmap against block size change. 
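For context, these new read locks pair with the write lock taken by the block-size-changing path introduced in commit b87570f5d349; roughly (hedged sketch, abbreviated from memory of that earlier fix):

	/* sketch: set_blocksize() side, excluding all pagecache users */
	percpu_down_write(&bdev->bd_block_size_semaphore);
	/* ... change the block size and sync/flush the pagecache ... */
	percpu_up_write(&bdev->bd_block_size_semaphore);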
Signed-off-by: Mikulas Patocka Signed-off-by: Linus Torvalds --- fs/block_dev.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index b3c1d3dae77d..1a1e5e3b1eaf 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1661,6 +1661,39 @@ static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) return ret; } +static ssize_t blkdev_splice_read(struct file *file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + ssize_t ret; + struct block_device *bdev = I_BDEV(file->f_mapping->host); + + percpu_down_read(&bdev->bd_block_size_semaphore); + + ret = generic_file_splice_read(file, ppos, pipe, len, flags); + + percpu_up_read(&bdev->bd_block_size_semaphore); + + return ret; +} + +static ssize_t blkdev_splice_write(struct pipe_inode_info *pipe, + struct file *file, loff_t *ppos, size_t len, + unsigned int flags) +{ + ssize_t ret; + struct block_device *bdev = I_BDEV(file->f_mapping->host); + + percpu_down_read(&bdev->bd_block_size_semaphore); + + ret = generic_file_splice_write(pipe, file, ppos, len, flags); + + percpu_up_read(&bdev->bd_block_size_semaphore); + + return ret; +} + + /* * Try to release a page associated with block device when the system * is under memory pressure. @@ -1699,8 +1732,8 @@ const struct file_operations def_blk_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = compat_blkdev_ioctl, #endif - .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_read = blkdev_splice_read, + .splice_write = blkdev_splice_write, }; int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) -- cgit v1.2.1 From ffb5387e85d528fb6d0d924abfa3fbf0fc484071 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sun, 28 Oct 2012 22:24:57 -0400 Subject: ext4: fix unjournaled inode bitmap modification commit 119c0d4460b001e44b41dcf73dc6ee794b98bd31 changed ext4_new_inode() such that the inode bitmap was being modified outside a transaction, which could lead to corruption, and was discovered when journal_checksum found a bad checksum in the journal during log replay. Nix ran into this when using the journal_async_commit mount option, which enables journal checksumming. The ensuing journal replay failures due to the bad checksums led to filesystem corruption reported as the now infamous "Apparent serious progressive ext4 data corruption bug" [ Changed by tytso to only call ext4_journal_get_write_access() only when we're fairly certain that we're going to allocate the inode. ] I've tested this by mounting with journal_checksum and running fsstress then dropping power; I've also tested by hacking DM to create snapshots w/o first quiescing, which allows me to test journal replay repeatedly w/o actually power-cycling the box. Without the patch I hit a journal checksum error every time. With this fix it survives many iterations. 
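The jbd2 ordering rule this patch restores can be sketched as follows (illustrative only, details elided):

	/* sketch: the required sequence for modifying journalled metadata */
	err = ext4_journal_get_write_access(handle, bh);	/* 1: declare intent */
	if (err)
		goto fail;
	ext4_test_and_set_bit(ino, bh->b_data);			/* 2: modify buffer */
	err = ext4_handle_dirty_metadata(handle, NULL, bh);	/* 3: log the change */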
Reported-by: Nix Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/ialloc.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 4facdd29a350..3a100e7a62a8 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -725,6 +725,10 @@ repeat_in_this_group: "inode=%lu", ino + 1); continue; } + BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, inode_bitmap_bh); + if (err) + goto fail; ext4_lock_group(sb, group); ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); ext4_unlock_group(sb, group); @@ -738,6 +742,11 @@ repeat_in_this_group: goto out; got: + BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); + if (err) + goto fail; + /* We may have to initialize the block bitmap if it isn't already */ if (ext4_has_group_desc_csum(sb) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { @@ -771,11 +780,6 @@ got: goto fail; } - BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, inode_bitmap_bh); - if (err) - goto fail; - BUFFER_TRACE(group_desc_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, group_desc_bh); if (err) @@ -823,11 +827,6 @@ got: } ext4_unlock_group(sb, group); - BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); - if (err) - goto fail; - BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); if (err) -- cgit v1.2.1 From 52eb5a900a9863a8b77a895f770e5d825c8e02c6 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 18 Oct 2012 14:01:43 -0700 Subject: ceph: fix dentry reference leak in encode_fh() Call to d_find_alias() needs a corresponding dput() This fixes http://tracker.newdream.net/issues/3271 Signed-off-by: David Zafman Reviewed-by: Sage Weil --- fs/ceph/export.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 8e1b60e557b6..862887004d20 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -90,6 +90,8 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, *max_len = handle_length; type = 255; } + if (dentry) + dput(dentry); return type; } -- cgit v1.2.1 From 5258f386ea4e8454bc801fb443e8a4217da1947c Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Sun, 28 Oct 2012 12:19:23 -0700 Subject: sched/autogroup: Fix crash on reboot when autogroup is disabled Due to these two commits: 8323f26ce342 sched: Fix race in task_group() 800d4d30c8f2 sched, autogroup: Stop going ahead if autogroup is disabled ... autogroup scheduling's dynamic knobs are wrecked. With both patches applied, all you have to do to crash a box is disable autogroup during boot up, then reboot.. boom, NULL pointer dereference due to 800d4d30 not allowing autogroup to move things, and 8323f26ce making that the only way to switch runqueues. Remove most of the (dysfunctional) knobs and turn the remaining sched_autogroup_enabled knob readonly. If the user fiddles with cgroups hereafter, once tasks are moved, autogroup won't mess with them again unless they call setsid(). No knobs, no glitz, nada, just a cute little thing folks can turn on if they don't want to muck about with cgroups and/or systemd. 
Signed-off-by: Mike Galbraith Cc: Xiaotian Feng Cc: Peter Zijlstra Cc: Xiaotian Feng Cc: Linus Torvalds Cc: Andrew Morton Cc: Oleg Nesterov Cc: # v3.6 Link: http://lkml.kernel.org/r/1351451963.4999.8.camel@maggy.simpson.net Signed-off-by: Ingo Molnar --- fs/proc/base.c | 78 ---------------------------------------------------------- 1 file changed, 78 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 1b6c84cbdb73..bb1d9623bad2 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1271,81 +1271,6 @@ static const struct file_operations proc_pid_sched_operations = { #endif -#ifdef CONFIG_SCHED_AUTOGROUP -/* - * Print out autogroup related information: - */ -static int sched_autogroup_show(struct seq_file *m, void *v) -{ - struct inode *inode = m->private; - struct task_struct *p; - - p = get_proc_task(inode); - if (!p) - return -ESRCH; - proc_sched_autogroup_show_task(p, m); - - put_task_struct(p); - - return 0; -} - -static ssize_t -sched_autogroup_write(struct file *file, const char __user *buf, - size_t count, loff_t *offset) -{ - struct inode *inode = file->f_path.dentry->d_inode; - struct task_struct *p; - char buffer[PROC_NUMBUF]; - int nice; - int err; - - memset(buffer, 0, sizeof(buffer)); - if (count > sizeof(buffer) - 1) - count = sizeof(buffer) - 1; - if (copy_from_user(buffer, buf, count)) - return -EFAULT; - - err = kstrtoint(strstrip(buffer), 0, &nice); - if (err < 0) - return err; - - p = get_proc_task(inode); - if (!p) - return -ESRCH; - - err = proc_sched_autogroup_set_nice(p, nice); - if (err) - count = err; - - put_task_struct(p); - - return count; -} - -static int sched_autogroup_open(struct inode *inode, struct file *filp) -{ - int ret; - - ret = single_open(filp, sched_autogroup_show, NULL); - if (!ret) { - struct seq_file *m = filp->private_data; - - m->private = inode; - } - return ret; -} - -static const struct file_operations proc_pid_sched_autogroup_operations = { - .open = sched_autogroup_open, - .read = seq_read, - .write = sched_autogroup_write, - .llseek = seq_lseek, - .release = single_release, -}; - -#endif /* CONFIG_SCHED_AUTOGROUP */ - static ssize_t comm_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { @@ -3035,9 +2960,6 @@ static const struct pid_entry tgid_base_stuff[] = { INF("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), -#endif -#ifdef CONFIG_SCHED_AUTOGROUP - REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK -- cgit v1.2.1 From 08f05c49749ee655bef921d12160960a273aad47 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 31 Oct 2012 03:37:48 +0000 Subject: Return the right error value when dup[23]() newfd argument is too large Jack Lin reports that the error return from dup3() for the RLIMIT_NOFILE case changed incorrectly after 3.6. The culprit is commit f33ff9927f42 ("take rlimit check to callers of expand_files()") which when it moved the "return -EMFILE" out to the caller, didn't notice that the dup3() had special code to turn the EMFILE return into EBADF. The replace_fd() helper that got added later then inherited the bug too. 
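A hedged userspace illustration of the restored behaviour (not part of the patch):

	/* sketch: a newfd at or beyond RLIMIT_NOFILE must fail with EBADF */
	#define _GNU_SOURCE
	#include <errno.h>
	#include <unistd.h>

	int main(void)
	{
		int ret = dup3(0, 1024 * 1024, 0); /* assume the limit is far lower */
		/* expected: ret == -1, errno == EBADF (the regression gave EMFILE) */
		return (ret == -1 && errno == EBADF) ? 0 : 1;
	}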
Reported-by: Jack Lin Signed-off-by: Al Viro [ Noted more bugs, wrote proper changelog, fixed up typos - Linus ] Signed-off-by: Linus Torvalds --- fs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index d3b5fa80b71b..708d997a7748 100644 --- a/fs/file.c +++ b/fs/file.c @@ -900,7 +900,7 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags) return __close_fd(files, fd); if (fd >= rlimit(RLIMIT_NOFILE)) - return -EMFILE; + return -EBADF; spin_lock(&files->file_lock); err = expand_files(files, fd); @@ -926,7 +926,7 @@ SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) return -EINVAL; if (newfd >= rlimit(RLIMIT_NOFILE)) - return -EMFILE; + return -EBADF; spin_lock(&files->file_lock); err = expand_files(files, newfd); -- cgit v1.2.1 From 399f11c3d872bd748e1575574de265a6304c7c43 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Tue, 30 Oct 2012 16:06:35 -0400 Subject: NFS: Wait for session recovery to finish before returning Currently, we will schedule session recovery and then return to the caller of nfs4_handle_exception. This works for most cases, but causes a hang on the following test case: Client Server ------ ------ Open file over NFS v4.1 Write to file Expire client Try to lock file The server will return NFS4ERR_BADSESSION, prompting the client to schedule recovery. However, the client will continue placing lock attempts and the open recovery never seems to be scheduled. The simplest solution is to wait for session recovery to run before retrying the lock. Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/nfs/nfs4proc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 68b21d81b7ac..d5fbf1f49d5f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -339,8 +339,7 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc dprintk("%s ERROR: %d Reset session\n", __func__, errorcode); nfs4_schedule_session_recovery(clp->cl_session, errorcode); - exception->retry = 1; - break; + goto wait_on_recovery; #endif /* defined(CONFIG_NFS_V4_1) */ case -NFS4ERR_FILE_OPEN: if (exception->timeout > HZ) { -- cgit v1.2.1 From 2240a9e2d013d8269ea425b73e1d7a54c7bc141f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 29 Oct 2012 18:37:40 -0400 Subject: NFSv4.1: We must release the sequence id when we fail to get a session slot If we do not release the sequence id in cases where we fail to get a session slot, then we can deadlock if we hit a recovery scenario. 
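As the diff below shows, every early-exit path that fails to start the RPC call now drops the seqid; the shape of the fix, sketched with generic names:

	/* sketch: if we cannot get a session slot, release the seqid we
	 * hold, or state recovery will block on it forever */
	if (nfs4_setup_sequence(server, &args->seq_args,
				&res->seq_res, task) != 0)
		nfs_release_seqid(args->seqid);
	else
		rpc_call_start(task);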
Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/nfs/nfs4proc.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d5fbf1f49d5f..e0423bb5a880 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1571,9 +1571,11 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) data->timestamp = jiffies; if (nfs4_setup_sequence(data->o_arg.server, &data->o_arg.seq_args, - &data->o_res.seq_res, task)) - return; - rpc_call_start(task); + &data->o_res.seq_res, + task) != 0) + nfs_release_seqid(data->o_arg.seqid); + else + rpc_call_start(task); return; unlock_no_action: rcu_read_unlock(); @@ -2295,9 +2297,10 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) if (nfs4_setup_sequence(NFS_SERVER(inode), &calldata->arg.seq_args, &calldata->res.seq_res, - task)) - goto out; - rpc_call_start(task); + task) != 0) + nfs_release_seqid(calldata->arg.seqid); + else + rpc_call_start(task); out: dprintk("%s: done!\n", __func__); } @@ -4544,9 +4547,11 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) calldata->timestamp = jiffies; if (nfs4_setup_sequence(calldata->server, &calldata->arg.seq_args, - &calldata->res.seq_res, task)) - return; - rpc_call_start(task); + &calldata->res.seq_res, + task) != 0) + nfs_release_seqid(calldata->arg.seqid); + else + rpc_call_start(task); } static const struct rpc_call_ops nfs4_locku_ops = { @@ -4691,7 +4696,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) /* Do we need to do an open_to_lock_owner? */ if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) { if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) - return; + goto out_release_lock_seqid; data->arg.open_stateid = &state->stateid; data->arg.new_lock_owner = 1; data->res.open_seqid = data->arg.open_seqid; @@ -4700,10 +4705,15 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) data->timestamp = jiffies; if (nfs4_setup_sequence(data->server, &data->arg.seq_args, - &data->res.seq_res, task)) + &data->res.seq_res, + task) == 0) { + rpc_call_start(task); return; - rpc_call_start(task); - dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status); + } + nfs_release_seqid(data->arg.open_seqid); +out_release_lock_seqid: + nfs_release_seqid(data->arg.lock_seqid); + dprintk("%s: done!, ret = %d\n", __func__, task->tk_status); } static void nfs4_recover_lock_prepare(struct rpc_task *task, void *calldata) -- cgit v1.2.1 From 2b1bc308f492589f7d49012ed24561534ea2be8c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 29 Oct 2012 18:53:23 -0400 Subject: NFSv4: nfs4_locku_done must release the sequence id If the state recovery machinery is triggered by the call to nfs4_async_handle_error() then we can deadlock. 
Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/nfs/nfs4proc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e0423bb5a880..1465364501ba 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4531,6 +4531,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) rpc_restart_call_prepare(task); } + nfs_release_seqid(calldata->arg.seqid); } static void nfs4_locku_prepare(struct rpc_task *task, void *data) -- cgit v1.2.1 From 8d96b10639fb402357b75b055b1e82a65ff95050 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 31 Oct 2012 12:16:01 +1100 Subject: NFS: fix bug in legacy DNS resolver. The DNS resolver's use of the sunrpc cache involves a 'ttl' number (relative) rather than a timeout (absolute). This confused me when I wrote commit c5b29f885afe890f953f7f23424045cdad31d3e4 "sunrpc: use seconds since boot in expiry cache" and I managed to break it. The effect is that any TTL is interpreted as 0, and nothing useful gets into the cache. This patch removes the use of get_expiry() - which really expects an expiry time - and uses get_uint() instead, treating the int correctly as a ttl. This fixes a regression that has been present since 2.6.37, causing certain NFS accesses in certain environments to incorrectly fail. Reported-by: Chuck Lever Tested-by: Chuck Lever Cc: stable@vger.kernel.org Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/dns_resolve.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index 31c26c4dcc23..ca4b11ec87a2 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -217,7 +217,7 @@ static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen) { char buf1[NFS_DNS_HOSTNAME_MAXLEN+1]; struct nfs_dns_ent key, *item; - unsigned long ttl; + unsigned int ttl; ssize_t len; int ret = -EINVAL; @@ -240,7 +240,8 @@ static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen) key.namelen = len; memset(&key.h, 0, sizeof(key.h)); - ttl = get_expiry(&buf); + if (get_uint(&buf, &ttl) < 0) + goto out; if (ttl == 0) goto out; key.h.expiry_time = ttl + seconds_since_boot(); -- cgit v1.2.1 From 7175fe90153e6375082d65884fbb41ab3bbb4901 Mon Sep 17 00:00:00 2001 From: Yanchuan Nian Date: Wed, 31 Oct 2012 16:05:48 +0800 Subject: nfs: Check whether a layout pointer is NULL before freeing it The new layout pointer in pnfs_find_alloc_layout() may be NULL because of an out-of-memory condition. We must check for this case; otherwise pnfs_free_layout_hdr() will go wrong because it cannot handle a NULL pointer. Signed-off-by: Yanchuan Nian Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index fe624c91bd00..2878f97bd78d 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -925,8 +925,8 @@ pnfs_find_alloc_layout(struct inode *ino, if (likely(nfsi->layout == NULL)) { /* Won the race?
*/ nfsi->layout = new; return new; - } - pnfs_free_layout_hdr(new); + } else if (new != NULL) + pnfs_free_layout_hdr(new); out_existing: pnfs_get_layout_hdr(nfsi->layout); return nfsi->layout; -- cgit v1.2.1 From acce94e68a0f346115fd41cdc298197d2d5a59ad Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Tue, 16 Oct 2012 13:22:19 -0400 Subject: nfsv3: Make v3 mounts fail with ETIMEDOUT instead of EIO on mountd timeouts In a very busy v3 environment, rpc.mountd can respond to the NULL procedure but not the MNT procedure in a timely manner, causing the MNT procedure to time out. The problem is the mount system call returns EIO which causes the mount to fail, instead of ETIMEDOUT, which would cause the mount to be retried. This patch sets the RPC_TASK_SOFT|RPC_TASK_TIMEOUT flags on the rpc_call_sync() call in nfs_mount(), which causes ETIMEDOUT to be returned on timed out connections. Signed-off-by: Steve Dickson Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/nfs/mount_clnt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 8e65c7f1f87c..015f71f8f62c 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -181,7 +181,7 @@ int nfs_mount(struct nfs_mount_request *info) else msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC_MNT]; - status = rpc_call_sync(mnt_clnt, &msg, 0); + status = rpc_call_sync(mnt_clnt, &msg, RPC_TASK_SOFT|RPC_TASK_TIMEOUT); rpc_shutdown_client(mnt_clnt); if (status < 0) -- cgit v1.2.1 From 97a54868262da1629a3e65121e65b8e8c4419d9f Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sun, 21 Oct 2012 19:23:52 +0100 Subject: nfs: Show original device name verbatim in /proc/*/mount{s,info} Since commit c7f404b ('vfs: new superblock methods to override /proc/*/mount{s,info}'), nfs_path() is used to generate the mounted device name reported back to userland. nfs_path() always generates a trailing slash when the given dentry is the root of an NFS mount, but userland may expect the original device name to be returned verbatim (as it used to be). Make this canonicalisation optional and change the callers accordingly.
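Condensed from the diffs below, the two call styles after this change (sketch):

	/* canonical form - used when constructing an internal devname */
	devname = nfs_path(&dummy, dentry, buffer, buflen, NFS_PATH_CANONICAL);

	/* verbatim form - used when reporting back to userland */
	devname = nfs_path(&dummy, root, page, PAGE_SIZE, 0);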
[jrnieder@gmail.com: use flag instead of bool argument] Reported-and-tested-by: Chris Hiestand Reference: http://bugs.debian.org/669314 Signed-off-by: Ben Hutchings Cc: # v2.6.39+ Signed-off-by: Jonathan Nieder Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 5 +++-- fs/nfs/namespace.c | 19 ++++++++++++++----- fs/nfs/nfs4namespace.c | 3 ++- fs/nfs/super.c | 2 +- 4 files changed, 20 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 59b133c5d652..a54fe51c1dfb 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -353,8 +353,9 @@ extern void nfs_sb_active(struct super_block *sb); extern void nfs_sb_deactive(struct super_block *sb); /* namespace.c */ +#define NFS_PATH_CANONICAL 1 extern char *nfs_path(char **p, struct dentry *dentry, - char *buffer, ssize_t buflen); + char *buffer, ssize_t buflen, unsigned flags); extern struct vfsmount *nfs_d_automount(struct path *path); struct vfsmount *nfs_submount(struct nfs_server *, struct dentry *, struct nfs_fh *, struct nfs_fattr *); @@ -498,7 +499,7 @@ static inline char *nfs_devname(struct dentry *dentry, char *buffer, ssize_t buflen) { char *dummy; - return nfs_path(&dummy, dentry, buffer, buflen); + return nfs_path(&dummy, dentry, buffer, buflen, NFS_PATH_CANONICAL); } /* diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 655925373b91..dd057bc6b65b 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -33,6 +33,7 @@ int nfs_mountpoint_expiry_timeout = 500 * HZ; * @dentry - pointer to dentry * @buffer - result buffer * @buflen - length of buffer + * @flags - options (see below) * * Helper function for constructing the server pathname * by arbitrary hashed dentry. @@ -40,8 +41,14 @@ int nfs_mountpoint_expiry_timeout = 500 * HZ; * This is mainly for use in figuring out the path on the * server side when automounting on top of an existing partition * and in generating /proc/mounts and friends. 
+ * + * Supported flags: + * NFS_PATH_CANONICAL: ensure there is exactly one slash after + * the original device (export) name + * (if unset, the original name is returned verbatim) */ -char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen) +char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen, + unsigned flags) { char *end; int namelen; @@ -74,7 +81,7 @@ rename_retry: rcu_read_unlock(); goto rename_retry; } - if (*end != '/') { + if ((flags & NFS_PATH_CANONICAL) && *end != '/') { if (--buflen < 0) { spin_unlock(&dentry->d_lock); rcu_read_unlock(); @@ -91,9 +98,11 @@ rename_retry: return end; } namelen = strlen(base); - /* Strip off excess slashes in base string */ - while (namelen > 0 && base[namelen - 1] == '/') - namelen--; + if (flags & NFS_PATH_CANONICAL) { + /* Strip off excess slashes in base string */ + while (namelen > 0 && base[namelen - 1] == '/') + namelen--; + } buflen -= namelen; if (buflen < 0) { spin_unlock(&dentry->d_lock); diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 79fbb61ce202..1e09eb78543b 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -81,7 +81,8 @@ static char *nfs_path_component(const char *nfspath, const char *end) static char *nfs4_path(struct dentry *dentry, char *buffer, ssize_t buflen) { char *limit; - char *path = nfs_path(&limit, dentry, buffer, buflen); + char *path = nfs_path(&limit, dentry, buffer, buflen, + NFS_PATH_CANONICAL); if (!IS_ERR(path)) { char *path_component = nfs_path_component(path, limit); if (path_component) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index e831bce49766..13c2a5be4765 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -771,7 +771,7 @@ int nfs_show_devname(struct seq_file *m, struct dentry *root) int err = 0; if (!page) return -ENOMEM; - devname = nfs_path(&dummy, root, page, PAGE_SIZE); + devname = nfs_path(&dummy, root, page, PAGE_SIZE, 0); if (IS_ERR(devname)) err = PTR_ERR(devname); else -- cgit v1.2.1 From 324d003b0cd82151adbaecefef57b73f7959a469 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Tue, 30 Oct 2012 17:01:39 -0400 Subject: NFS: add nfs_sb_deactive_async to avoid deadlock Use nfs_sb_deactive_async instead of nfs_sb_deactive when in a workqueue context. This avoids a deadlock where rpc_shutdown_client loops forever in a workqueue kworker context, trying to kill all RPC tasks associated with the client, while one or more of these tasks have already been assigned to the same kworker (and will never run rpc_exit_task). This approach is needed because RPC tasks that have already been assigned to a kworker by queue_work cannot be canceled, as explained in the comment for workqueue.c:insert_wq_barrier. Signed-off-by: Weston Andros Adamson [Trond: add module_get/put.] 
Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 5 ++++- fs/nfs/internal.h | 1 + fs/nfs/nfs4proc.c | 2 +- fs/nfs/super.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/nfs/unlink.c | 2 +- 5 files changed, 56 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 5c7325c5c5e6..6fa01aea2488 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -685,7 +685,10 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) if (ctx->cred != NULL) put_rpccred(ctx->cred); dput(ctx->dentry); - nfs_sb_deactive(sb); + if (is_sync) + nfs_sb_deactive(sb); + else + nfs_sb_deactive_async(sb); kfree(ctx->mdsthreshold); kfree(ctx); } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index a54fe51c1dfb..05521cadac2e 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -351,6 +351,7 @@ extern int __init register_nfs_fs(void); extern void __exit unregister_nfs_fs(void); extern void nfs_sb_active(struct super_block *sb); extern void nfs_sb_deactive(struct super_block *sb); +extern void nfs_sb_deactive_async(struct super_block *sb); /* namespace.c */ #define NFS_PATH_CANONICAL 1 diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1465364501ba..8cfbac1a8d5e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2197,7 +2197,7 @@ static void nfs4_free_closedata(void *data) nfs4_put_open_state(calldata->state); nfs_free_seqid(calldata->arg.seqid); nfs4_put_state_owner(sp); - nfs_sb_deactive(sb); + nfs_sb_deactive_async(sb); kfree(calldata); } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 13c2a5be4765..652d3f7176a9 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -54,6 +54,7 @@ #include #include #include +#include #include @@ -415,6 +416,54 @@ void nfs_sb_deactive(struct super_block *sb) } EXPORT_SYMBOL_GPL(nfs_sb_deactive); +static int nfs_deactivate_super_async_work(void *ptr) +{ + struct super_block *sb = ptr; + + deactivate_super(sb); + module_put_and_exit(0); + return 0; +} + +/* + * same effect as deactivate_super, but will do final unmount in kthread + * context + */ +static void nfs_deactivate_super_async(struct super_block *sb) +{ + struct task_struct *task; + char buf[INET6_ADDRSTRLEN + 1]; + struct nfs_server *server = NFS_SB(sb); + struct nfs_client *clp = server->nfs_client; + + if (!atomic_add_unless(&sb->s_active, -1, 1)) { + rcu_read_lock(); + snprintf(buf, sizeof(buf), + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + rcu_read_unlock(); + + __module_get(THIS_MODULE); + task = kthread_run(nfs_deactivate_super_async_work, sb, + "%s-deactivate-super", buf); + if (IS_ERR(task)) { + pr_err("%s: kthread_run: %ld\n", + __func__, PTR_ERR(task)); + /* make synchronous call and hope for the best */ + deactivate_super(sb); + module_put(THIS_MODULE); + } + } +} + +void nfs_sb_deactive_async(struct super_block *sb) +{ + struct nfs_server *server = NFS_SB(sb); + + if (atomic_dec_and_test(&server->active)) + nfs_deactivate_super_async(sb); +} +EXPORT_SYMBOL_GPL(nfs_sb_deactive_async); + /* * Deliver file system statistics to userspace */ diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 13cea637eff8..3f79c77153b8 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -95,7 +95,7 @@ static void nfs_async_unlink_release(void *calldata) nfs_dec_sillycount(data->dir); nfs_free_unlinkdata(data); - nfs_sb_deactive(sb); + nfs_sb_deactive_async(sb); } static void nfs_unlink_prepare(struct rpc_task *task, void *calldata) -- cgit v1.2.1 From f9b1ef5f06d65a01952169b67d474f7f0dcb0206 Mon Sep 17 00:00:00 2001 From: 
Trond Myklebust Date: Mon, 29 Oct 2012 16:48:40 -0400 Subject: NFSv4: Initialise the NFSv4.1 slot table highest_used_slotid correctly Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8cfbac1a8d5e..091baab3eccf 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5677,7 +5677,7 @@ static void nfs4_add_and_init_slots(struct nfs4_slot_table *tbl, tbl->slots = new; tbl->max_slots = max_slots; } - tbl->highest_used_slotid = -1; /* no slot is currently used */ + tbl->highest_used_slotid = NFS4_NO_SLOT; for (i = 0; i < tbl->max_slots; i++) tbl->slots[i].seq_nr = ivalue; spin_unlock(&tbl->slot_tbl_lock); -- cgit v1.2.1 From eeee2b5fe1a9db15d3160da8048d9b89108753bf Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 18 Oct 2012 22:57:19 +0800 Subject: dlm: remove unused variable in *dlm_lowcomms_get_buffer() The variable users is initialized but never used otherwise, so remove the unused variable. dpatch engine is used to auto generate this patch. (https://github.com/weiyj/dpatch) Signed-off-by: Wei Yongjun Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 331ea4f94efd..dd87a31bcc21 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1385,7 +1385,6 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) struct connection *con; struct writequeue_entry *e; int offset = 0; - int users = 0; con = nodeid2con(nodeid, allocation); if (!con) @@ -1399,7 +1398,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) } else { offset = e->end; e->end += len; - users = e->users++; + e->users++; } spin_unlock(&con->writequeue_lock); @@ -1414,7 +1413,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) spin_lock(&con->writequeue_lock); offset = e->end; e->end += len; - users = e->users++; + e->users++; list_add_tail(&e->list, &con->writequeue); spin_unlock(&con->writequeue_lock); goto got_one; -- cgit v1.2.1 From a3de56bdb980c63b01662cac05d430db60ff4374 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 23 Oct 2012 13:03:38 -0700 Subject: fs/dlm: remove CONFIG_EXPERIMENTAL This config item has not carried much meaning for a while now and is almost always enabled by default. As agreed during the Linux kernel summit, remove it. CC: Christine Caulfield Signed-off-by: Kees Cook Signed-off-by: David Teigland --- fs/dlm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index 1897eb1b4b6a..e4242c3f8486 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig @@ -1,6 +1,6 @@ menuconfig DLM tristate "Distributed Lock Manager (DLM)" - depends on EXPERIMENTAL && INET + depends on INET depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n) select IP_SCTP help -- cgit v1.2.1 From 1375cb65e87b327a8dd4f920c3e3d837fb40e9c2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 9 Oct 2012 14:50:52 +1100 Subject: xfs: growfs: don't read garbage for new secondary superblocks When updating new secondary superblocks in a growfs operation, the superblock buffer is read from the newly grown region of the underlying device. This is not guaranteed to be zero, so violates the underlying assumption that the unused parts of superblocks are zero filled. 
Get a new buffer for these secondary superblocks to ensure that the unused regions are zero filled correctly. Signed-off-by: Dave Chinner Reviewed-by: Carlos Maiolino Signed-off-by: Ben Myers --- fs/xfs/xfs_fsops.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index c25b094efbf7..4beaede43277 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -399,9 +399,26 @@ xfs_growfs_data_private( /* update secondary superblocks. */ for (agno = 1; agno < nagcount; agno++) { - error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + error = 0; + /* + * new secondary superblocks need to be zeroed, not read from + * disk as the contents of the new area we are growing into is + * completely unknown. + */ + if (agno < oagcount) { + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp); + } else { + bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), + XFS_FSS_TO_BB(mp, 1), 0); + if (bp) + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + else + error = ENOMEM; + } + if (error) { xfs_warn(mp, "error %d reading secondary superblock for ag %d", @@ -423,7 +440,7 @@ xfs_growfs_data_private( break; /* no point in continuing */ } } - return 0; + return error; error0: xfs_trans_cancel(tp, XFS_TRANS_ABORT); -- cgit v1.2.1 From 531c3bdc8662e1a83f8ec80dc3346b1284877c0a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 25 Oct 2012 17:22:30 +1100 Subject: xfs: silence uninitialised f.file warning. Uninitialised variable build warning introduced by 2903ff0 ("switch simple cases of fget_light to fdget"): gcc is not smart enough to work out that the variable is not used uninitialised, and the commit removed the initialisation at declaration that the old variable had. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 8305f2ac6773..c1df3c623de2 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -70,7 +70,7 @@ xfs_find_handle( int hsize; xfs_handle_t handle; struct inode *inode; - struct fd f; + struct fd f = {0}; struct path path; int error; struct xfs_inode *ip; -- cgit v1.2.1 From cd856db69c88db438215244571957d812bdc6813 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Sat, 20 Oct 2012 11:08:19 -0300 Subject: xfs: Update inode alloc comments I found some out-of-date comments while studying the inode allocation code, so I believe it's worth having these comments updated. It basically rewrites the comment regarding the "call_again" variable, which is not used anymore; instead, callers of xfs_ialloc() decide whether it needs to be called again based only on whether ialloc_context is NULL or not. I also made some small changes to another comment that I thought pertinent to the current behaviour of these functions, and fixed some alignment in both comments.
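A hedged sketch of the two-call convention those comments describe (argument list abbreviated and partly assumed):

	/* sketch: the first call may return the locked AGI buffer instead
	 * of an inode number */
	struct xfs_buf *agibp = NULL;

	error = xfs_dialloc(tp, parent_ino, mode, okalloc, &agibp, &ino);
	if (!error && agibp) {
		/* no inode yet: commit tp, start a new transaction and
		 * call xfs_dialloc() again with the same agibp, which
		 * stays locked across both calls */
	}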
Signed-off-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_ialloc.c | 6 +++--- fs/xfs/xfs_inode.c | 18 +++++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index c5c4ef4f2bdb..37753e1c8537 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -877,9 +877,9 @@ error0: * This function is designed to be called twice if it has to do an allocation * to make more free inodes. On the first call, *IO_agbp should be set to NULL. * If an inode is available without having to performn an allocation, an inode - * number is returned. In this case, *IO_agbp would be NULL. If an allocation - * needes to be done, xfs_dialloc would return the current AGI buffer in - * *IO_agbp. The caller should then commit the current transaction, allocate a + * number is returned. In this case, *IO_agbp is set to NULL. If an allocation + * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp. + * The caller should then commit the current transaction, allocate a * new transaction, and call xfs_dialloc() again, passing in the previous value * of *IO_agbp. IO_agbp should be held across the transactions. Since the AGI * buffer is locked across the two calls, the second call is guaranteed to have diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index bba8f37525b3..95f7a73b05cb 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1104,16 +1104,16 @@ xfs_iread_extents( * set according to the contents of the given cred structure. * * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc() - * has a free inode available, call xfs_iget() - * to obtain the in-core version of the allocated inode. Finally, - * fill in the inode and log its initial contents. In this case, - * ialloc_context would be set to NULL and call_again set to false. + * has a free inode available, call xfs_iget() to obtain the in-core + * version of the allocated inode. Finally, fill in the inode and + * log its initial contents. In this case, ialloc_context would be + * set to NULL. * - * If xfs_dialloc() does not have an available inode, - * it will replenish its supply by doing an allocation. Since we can - * only do one allocation within a transaction without deadlocks, we - * must commit the current transaction before returning the inode itself. - * In this case, therefore, we will set call_again to true and return. + * If xfs_dialloc() does not have an available inode, it will replenish + * its supply by doing an allocation. Since we can only do one + * allocation within a transaction without deadlocks, we must commit + * the current transaction before returning the inode itself. + * In this case, therefore, we will set ialloc_context and return. * The caller should then commit the current transaction, start a new * transaction, and call xfs_ialloc() again to actually get the inode. * -- cgit v1.2.1 From 998f40b550f257e436485291802fa938e4cf580f Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Fri, 2 Nov 2012 18:00:56 -0400 Subject: NFS4: nfs4_opendata_access should return errno Return errno - not an NFS4ERR_. This worked because NFS4ERR_ACCESS == EACCES. 
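The subtlety, sketched (both constants happen to be 13, which is what hid the bug):

	/* sketch: return a proper errno, not an NFSv4 wire error */
	return -EACCES;		/* not -NFS4ERR_ACCESS, even though the
				 * two values are numerically equal */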
Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 091baab3eccf..5eec4429970c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1749,7 +1749,7 @@ static int nfs4_opendata_access(struct rpc_cred *cred, /* even though OPEN succeeded, access is denied. Close the file */ nfs4_close_state(state, fmode); - return -NFS4ERR_ACCESS; + return -EACCES; } /* -- cgit v1.2.1 From 36960e440ccf94349c09fb944930d3bfe4bc473f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 3 Nov 2012 09:37:28 -0400 Subject: cifs: fix potential buffer overrun in cifs.idmap handling code The userspace cifs.idmap program generally works with the wbclient libs to generate binary SIDs in userspace. That program defines the struct that holds these values as having a max of 15 subauthorities. The kernel idmapping code however limits that value to 5. When the kernel copies those values around though, it doesn't sanity check the num_subauths value handed back from userspace or from the server. It's possible therefore for userspace to hand us back a bogus num_subauths value (or one that's valid, but greater than 5) that could cause the kernel to walk off the end of the cifs_sid->sub_auths array. Fix this by defining a new routine for copying sids and using that in all of the places that copy it. If we end up with a sid that's longer than expected then this approach will just lop off the "extra" subauths, but that's basically what the code does today already. Better approaches might be to fix this code to reject SIDs with >5 subauths, or fix it to handle the subauths array dynamically. At the same time, change the kernel to check the length of the data returned by userspace. If it's shorter than struct cifs_sid, reject it and return -EIO. If that happens we'll end up with fields that are basically uninitialized. Long term, it might make sense to redefine cifs_sid using a flexarray at the end, to allow for variable-length subauth lists, and teach the code to handle the case where the subauths array being passed in from userspace is shorter than 5 elements. Note too, that I don't consider this a security issue since you'd need a compromised cifs.idmap program. If you have that, you can do all sorts of nefarious stuff. Still, this is probably reasonable for stable. 
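The heart of the fix is the single bounded-copy helper, reproduced from the diff below for emphasis; it clamps an externally supplied count before that count is used to walk a fixed-size array:

	static void
	cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src)
	{
		memcpy(dst, src, sizeof(*dst));
		/* never trust num_subauth from userspace or the server */
		dst->num_subauth = min_t(u8, src->num_subauth, NUM_SUBAUTHS);
	}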
Cc: stable@kernel.org Reviewed-by: Shirish Pargaonkar Signed-off-by: Jeff Layton --- fs/cifs/cifsacl.c | 49 ++++++++++++++++++++----------------------------- 1 file changed, 20 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index fc783e264420..0fb15bbbe43c 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -224,6 +224,13 @@ sid_to_str(struct cifs_sid *sidptr, char *sidstr) } } +static void +cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src) +{ + memcpy(dst, src, sizeof(*dst)); + dst->num_subauth = min_t(u8, src->num_subauth, NUM_SUBAUTHS); +} + static void id_rb_insert(struct rb_root *root, struct cifs_sid *sidptr, struct cifs_sid_id **psidid, char *typestr) @@ -248,7 +255,7 @@ id_rb_insert(struct rb_root *root, struct cifs_sid *sidptr, } } - memcpy(&(*psidid)->sid, sidptr, sizeof(struct cifs_sid)); + cifs_copy_sid(&(*psidid)->sid, sidptr); (*psidid)->time = jiffies - (SID_MAP_RETRY + 1); (*psidid)->refcount = 0; @@ -354,7 +361,7 @@ id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid) * any fields of the node after a reference is put . */ if (test_bit(SID_ID_MAPPED, &psidid->state)) { - memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid)); + cifs_copy_sid(ssid, &psidid->sid); psidid->time = jiffies; /* update ts for accessing */ goto id_sid_out; } @@ -370,14 +377,14 @@ id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid) if (IS_ERR(sidkey)) { rc = -EINVAL; cFYI(1, "%s: Can't map and id to a SID", __func__); + } else if (sidkey->datalen < sizeof(struct cifs_sid)) { + rc = -EIO; + cFYI(1, "%s: Downcall contained malformed key " + "(datalen=%hu)", __func__, sidkey->datalen); } else { lsid = (struct cifs_sid *)sidkey->payload.data; - memcpy(&psidid->sid, lsid, - sidkey->datalen < sizeof(struct cifs_sid) ? - sidkey->datalen : sizeof(struct cifs_sid)); - memcpy(ssid, &psidid->sid, - sidkey->datalen < sizeof(struct cifs_sid) ? 
- sidkey->datalen : sizeof(struct cifs_sid)); + cifs_copy_sid(&psidid->sid, lsid); + cifs_copy_sid(ssid, &psidid->sid); set_bit(SID_ID_MAPPED, &psidid->state); key_put(sidkey); kfree(psidid->sidstr); @@ -396,7 +403,7 @@ id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid) return rc; } if (test_bit(SID_ID_MAPPED, &psidid->state)) - memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid)); + cifs_copy_sid(ssid, &psidid->sid); else rc = -EINVAL; } @@ -675,8 +682,6 @@ int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid) static void copy_sec_desc(const struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, __u32 sidsoffset) { - int i; - struct cifs_sid *owner_sid_ptr, *group_sid_ptr; struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr; @@ -692,26 +697,14 @@ static void copy_sec_desc(const struct cifs_ntsd *pntsd, owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + le32_to_cpu(pntsd->osidoffset)); nowner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset); - - nowner_sid_ptr->revision = owner_sid_ptr->revision; - nowner_sid_ptr->num_subauth = owner_sid_ptr->num_subauth; - for (i = 0; i < 6; i++) - nowner_sid_ptr->authority[i] = owner_sid_ptr->authority[i]; - for (i = 0; i < 5; i++) - nowner_sid_ptr->sub_auth[i] = owner_sid_ptr->sub_auth[i]; + cifs_copy_sid(nowner_sid_ptr, owner_sid_ptr); /* copy group sid */ group_sid_ptr = (struct cifs_sid *)((char *)pntsd + le32_to_cpu(pntsd->gsidoffset)); ngroup_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset + sizeof(struct cifs_sid)); - - ngroup_sid_ptr->revision = group_sid_ptr->revision; - ngroup_sid_ptr->num_subauth = group_sid_ptr->num_subauth; - for (i = 0; i < 6; i++) - ngroup_sid_ptr->authority[i] = group_sid_ptr->authority[i]; - for (i = 0; i < 5; i++) - ngroup_sid_ptr->sub_auth[i] = group_sid_ptr->sub_auth[i]; + cifs_copy_sid(ngroup_sid_ptr, group_sid_ptr); return; } @@ -1120,8 +1113,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, kfree(nowner_sid_ptr); return rc; } - memcpy(owner_sid_ptr, nowner_sid_ptr, - sizeof(struct cifs_sid)); + cifs_copy_sid(owner_sid_ptr, nowner_sid_ptr); kfree(nowner_sid_ptr); *aclflag = CIFS_ACL_OWNER; } @@ -1139,8 +1131,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, kfree(ngroup_sid_ptr); return rc; } - memcpy(group_sid_ptr, ngroup_sid_ptr, - sizeof(struct cifs_sid)); + cifs_copy_sid(group_sid_ptr, ngroup_sid_ptr); kfree(ngroup_sid_ptr); *aclflag = CIFS_ACL_GROUP; } -- cgit v1.2.1 From 4d1d0534f53863108fdea496288cb3310f88118d Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sat, 3 Nov 2012 10:32:37 +0800 Subject: ceph: Hold caps_list_lock when adjusting caps_{use, total}_count Signed-off-by: Yan, Zheng Signed-off-by: Sage Weil --- fs/ceph/caps.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 3251e9cc6401..2d0141e95c88 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -236,8 +236,10 @@ static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc, if (!ctx) { cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); if (cap) { + spin_lock(&mdsc->caps_list_lock); mdsc->caps_use_count++; mdsc->caps_total_count++; + spin_unlock(&mdsc->caps_list_lock); } return cap; } -- cgit v1.2.1 From 1fea73a86527d7ec463af6ff04b0830e1425ff6c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 15 Oct 2012 11:24:57 -0400 Subject: NFS: Get rid of unnecessary asserts If the nfs_client fails to initialise correctly, then it will return an error condition. 
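For readers following this and the subsequent conversion patches, the practical difference between the two assertion styles is the point: BUG_ON() oopses the machine, while WARN_ON_ONCE() prints one stack trace and lets the caller recover. A hedged sketch (do_thing and struct foo are hypothetical, not kernel APIs):

	/* Sketch: converting a fatal assert into a recoverable warning. */
	static int do_thing(struct foo *f)
	{
		/* old style: BUG_ON(f == NULL); -- halts the whole kernel */
		if (WARN_ON_ONCE(f == NULL))	/* warns once, first hit only */
			return -EINVAL;		/* then fails just this call */
		return 0;
	}

WARN_ON_ONCE() evaluates to the tested condition, which is what makes the warn-and-bail idiom above work.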
Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 6 +----- fs/nfs/nfs4client.c | 4 ---- 2 files changed, 1 insertion(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 8b39a42ac35e..c285e0a117e4 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -277,7 +277,7 @@ void nfs_put_client(struct nfs_client *clp) nfs_cb_idr_remove_locked(clp); spin_unlock(&nn->nfs_client_lock); - BUG_ON(!list_empty(&clp->cl_superblocks)); + WARN_ON_ONCE(!list_empty(&clp->cl_superblocks)); clp->rpc_ops->free_client(clp); } @@ -1061,10 +1061,6 @@ struct nfs_server *nfs_create_server(struct nfs_mount_info *mount_info, if (error < 0) goto error; - BUG_ON(!server->nfs_client); - BUG_ON(!server->nfs_client->rpc_ops); - BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); - /* Probe the root fh to retrieve its FSID */ error = nfs_probe_fsinfo(server, mount_info->mntfh, fattr); if (error < 0) diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 6bacfde1319a..72717e67b34e 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -713,10 +713,6 @@ static int nfs4_server_common_setup(struct nfs_server *server, struct nfs_fattr *fattr; int error; - BUG_ON(!server->nfs_client); - BUG_ON(!server->nfs_client->rpc_ops); - BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); - /* data servers support only a subset of NFSv4.1 */ if (is_ds_only_client(server->nfs_client)) return -EPROTONOSUPPORT; -- cgit v1.2.1 From 7fc388460e8479c5b3120cb2fcf0e0daec70b93f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 15 Oct 2012 11:51:21 -0400 Subject: NFS: Remove asserts from the NFS XDR code Convert the ones that are not trivial to check into WARN_ON_ONCE(). Remove checks for things such as NFS2_MAXPATHLEN, which are trivially done by the caller. Add a comment to the case of nfs3_xdr_enc_setacl3args. What is being done there is just wrong... 
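The common theme of the XDR changes: length validation belongs at the entry points, where an error can still be returned, not deep inside an encoder where the only remaining option is to crash. A hedged sketch of the caller-side check (nfs_validate_mount_path is hypothetical; MNTPATHLEN mirrors the mount-path validation added later in this series):

	/* Sketch: reject an oversized path up front instead of asserting
	 * inside the XDR encoder. */
	static int nfs_validate_mount_path(const char *dirpath)
	{
		if (strlen(dirpath) > MNTPATHLEN)
			return -ENAMETOOLONG;	/* caller sees a clean error */
		return 0;	/* the encoder may now trust the length */
	}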
Signed-off-by: Trond Myklebust --- fs/nfs/nfs2xdr.c | 4 +--- fs/nfs/nfs3xdr.c | 7 +++---- fs/nfs/nfs4xdr.c | 6 ++---- 3 files changed, 6 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index d04f0df7be55..06b9df49f7f7 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -195,7 +195,6 @@ static void encode_fhandle(struct xdr_stream *xdr, const struct nfs_fh *fh) { __be32 *p; - BUG_ON(fh->size != NFS2_FHSIZE); p = xdr_reserve_space(xdr, NFS2_FHSIZE); memcpy(p, fh->data, NFS2_FHSIZE); } @@ -388,7 +387,7 @@ static void encode_filename(struct xdr_stream *xdr, { __be32 *p; - BUG_ON(length > NFS2_MAXNAMLEN); + WARN_ON_ONCE(length > NFS2_MAXNAMLEN); p = xdr_reserve_space(xdr, 4 + length); xdr_encode_opaque(p, name, length); } @@ -428,7 +427,6 @@ static void encode_path(struct xdr_stream *xdr, struct page **pages, u32 length) { __be32 *p; - BUG_ON(length > NFS2_MAXPATHLEN); p = xdr_reserve_space(xdr, 4); *p = cpu_to_be32(length); xdr_write_pages(xdr, pages, 0, length); diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 6cbe89400dfc..bffc32406fbf 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -198,7 +198,7 @@ static void encode_filename3(struct xdr_stream *xdr, { __be32 *p; - BUG_ON(length > NFS3_MAXNAMLEN); + WARN_ON_ONCE(length > NFS3_MAXNAMLEN); p = xdr_reserve_space(xdr, 4 + length); xdr_encode_opaque(p, name, length); } @@ -238,7 +238,6 @@ out_overflow: static void encode_nfspath3(struct xdr_stream *xdr, struct page **pages, const u32 length) { - BUG_ON(length > NFS3_MAXPATHLEN); encode_uint32(xdr, length); xdr_write_pages(xdr, pages, 0, length); } @@ -388,7 +387,6 @@ out_overflow: */ static void encode_ftype3(struct xdr_stream *xdr, const u32 type) { - BUG_ON(type > NF3FIFO); encode_uint32(xdr, type); } @@ -443,7 +441,7 @@ static void encode_nfs_fh3(struct xdr_stream *xdr, const struct nfs_fh *fh) { __be32 *p; - BUG_ON(fh->size > NFS3_FHSIZE); + WARN_ON_ONCE(fh->size > NFS3_FHSIZE); p = xdr_reserve_space(xdr, 4 + fh->size); xdr_encode_opaque(p, fh->data, fh->size); } @@ -1339,6 +1337,7 @@ static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req, error = nfsacl_encode(xdr->buf, base, args->inode, (args->mask & NFS_ACL) ? args->acl_access : NULL, 1, 0); + /* FIXME: this is just broken */ BUG_ON(error < 0); error = nfsacl_encode(xdr->buf, base + error, args->inode, (args->mask & NFS_DFACL) ? diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 40836ee5dc3a..672d9b0ef2c5 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -936,7 +936,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr, * but this is not required as a MUST for the server to do so. 
*/ hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen; - BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); + WARN_ON_ONCE(hdr->taglen > NFS4_MAXTAGLEN); encode_string(xdr, hdr->taglen, hdr->tag); p = reserve_space(xdr, 8); *p++ = cpu_to_be32(hdr->minorversion); @@ -955,7 +955,7 @@ static void encode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 op, static void encode_nops(struct compound_hdr *hdr) { - BUG_ON(hdr->nops > NFS4_MAX_OPS); + WARN_ON_ONCE(hdr->nops > NFS4_MAX_OPS); *hdr->nops_p = htonl(hdr->nops); } @@ -1403,7 +1403,6 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a *p = cpu_to_be32(NFS4_OPEN_NOCREATE); break; default: - BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL); *p = cpu_to_be32(NFS4_OPEN_CREATE); encode_createmode(xdr, arg); } @@ -1621,7 +1620,6 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun p = reserve_space(xdr, 2*4); *p++ = cpu_to_be32(1); *p = cpu_to_be32(FATTR4_WORD0_ACL); - BUG_ON(arg->acl_len % 4); p = reserve_space(xdr, 4); *p = cpu_to_be32(arg->acl_len); xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); -- cgit v1.2.1 From d3edcf96141a7729b12ef5ecab6d5f634e24c61a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 15 Oct 2012 13:14:43 -0400 Subject: NFSv4: Remove the BUG_ON() from nfs4_get_lease_time_prepare()... An EAGAIN return value would be unexpected, but there is no reason to BUG... Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5eec4429970c..14d86ef493a0 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5581,8 +5581,8 @@ static void nfs4_get_lease_time_prepare(struct rpc_task *task, &data->args->la_seq_args, &data->res->lr_seq_res, task); - BUG_ON(ret == -EAGAIN); - rpc_call_start(task); + if (ret != -EAGAIN) + rpc_call_start(task); dprintk("<-- %s\n", __func__); } -- cgit v1.2.1 From eba24e1fe57df4e4cdee58af940f762eb336a113 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 15 Oct 2012 14:47:33 -0400 Subject: NFSv4.1: Remove unused function last_byte_offset Signed-off-by: Trond Myklebust --- fs/nfs/objlayout/objlayout.c | 11 ----------- fs/nfs/pnfs.c | 11 ----------- 2 files changed, 22 deletions(-) (limited to 'fs') diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 874613545301..a9ebd817278b 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -148,17 +148,6 @@ end_offset(u64 start, u64 len) return end >= start ? end : NFS4_MAX_UINT64; } -/* last octet in a range */ -static inline u64 -last_byte_offset(u64 start, u64 len) -{ - u64 end; - - BUG_ON(!len); - end = start + len; - return end > start ? end - 1 : NFS4_MAX_UINT64; -} - static void _fix_verify_io_params(struct pnfs_layout_segment *lseg, struct page ***p_pages, unsigned *p_pgbase, u64 offset, unsigned long count) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 2878f97bd78d..dcbc9b20474b 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -369,17 +369,6 @@ end_offset(u64 start, u64 len) return end >= start ? end : NFS4_MAX_UINT64; } -/* last octet in a range */ -static inline u64 -last_byte_offset(u64 start, u64 len) -{ - u64 end; - - BUG_ON(!len); - end = start + len; - return end > start ? end - 1 : NFS4_MAX_UINT64; -} - /* * is l2 fully contained in l1? 
* start1 end1 -- cgit v1.2.1 From bc5a89b337ee4b2fa6f577e7e1220d8c1ece71fc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 15 Oct 2012 14:58:04 -0400 Subject: NFSv4.1: Remove assertion BUG_ON()s from the files and generic layout code Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 13 ++++--------- fs/nfs/nfs4filelayoutdev.c | 2 -- fs/nfs/pnfs.c | 6 ++---- 3 files changed, 6 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 2e45fd9c02a3..bfb28fa38e74 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -512,7 +512,6 @@ filelayout_read_pagelist(struct nfs_read_data *data) loff_t offset = data->args.offset; u32 j, idx; struct nfs_fh *fh; - int status; dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n", __func__, hdr->inode->i_ino, @@ -538,9 +537,8 @@ filelayout_read_pagelist(struct nfs_read_data *data) data->mds_offset = offset; /* Perform an asynchronous read to ds */ - status = nfs_initiate_read(ds->ds_clp->cl_rpcclient, data, + nfs_initiate_read(ds->ds_clp->cl_rpcclient, data, &filelayout_read_call_ops, RPC_TASK_SOFTCONN); - BUG_ON(status != 0); return PNFS_ATTEMPTED; } @@ -554,7 +552,6 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync) loff_t offset = data->args.offset; u32 j, idx; struct nfs_fh *fh; - int status; /* Retrieve the correct rpc_client for the byte range */ j = nfs4_fl_calc_j_index(lseg, offset); @@ -579,10 +576,9 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync) data->args.offset = filelayout_get_dserver_offset(lseg, offset); /* Perform an asynchronous write */ - status = nfs_initiate_write(ds->ds_clp->cl_rpcclient, data, + nfs_initiate_write(ds->ds_clp->cl_rpcclient, data, &filelayout_write_call_ops, sync, RPC_TASK_SOFTCONN); - BUG_ON(status != 0); return PNFS_ATTEMPTED; } @@ -909,7 +905,7 @@ static void filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - BUG_ON(pgio->pg_lseg != NULL); + WARN_ON_ONCE(pgio->pg_lseg != NULL); if (req->wb_offset != req->wb_pgbase) { /* @@ -939,7 +935,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_commit_info cinfo; int status; - BUG_ON(pgio->pg_lseg != NULL); + WARN_ON_ONCE(pgio->pg_lseg != NULL); if (req->wb_offset != req->wb_pgbase) goto out_mds; @@ -1187,7 +1183,6 @@ static void filelayout_recover_commit_reqs(struct list_head *dst, */ for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { if (transfer_commit_list(&b->written, dst, cinfo, 0)) { - BUG_ON(!list_empty(&b->written)); pnfs_put_lseg(b->wlseg); b->wlseg = NULL; } diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index a8eaa9b7bb0f..93e2530d7098 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -162,8 +162,6 @@ nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr, mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor); - BUG_ON(list_empty(&ds->ds_addrs)); - list_for_each_entry(da, &ds->ds_addrs, da_node) { dprintk("%s: DS %s: trying address %s\n", __func__, ds->ds_remotestr, da->da_remotestr); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index dcbc9b20474b..e7165d915362 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -634,7 +634,6 @@ send_layoutget(struct pnfs_layout_hdr *lo, dprintk("--> %s\n", __func__); - BUG_ON(ctx == NULL); lgp = kzalloc(sizeof(*lgp), gfp_flags); if (lgp == NULL) return NULL; @@ -1115,7 +1114,6 @@ 
pnfs_update_layout(struct inode *ino, * chance of a CB_LAYOUTRECALL(FILE) coming in. */ spin_lock(&clp->cl_lock); - BUG_ON(!list_empty(&lo->plh_layouts)); list_add_tail(&lo->plh_layouts, &server->layouts); spin_unlock(&clp->cl_lock); } @@ -1211,7 +1209,7 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r { u64 rd_size = req->wb_bytes; - BUG_ON(pgio->pg_lseg != NULL); + WARN_ON_ONCE(pgio->pg_lseg != NULL); if (req->wb_offset != req->wb_pgbase) { nfs_pageio_reset_read_mds(pgio); @@ -1240,7 +1238,7 @@ void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req, u64 wb_size) { - BUG_ON(pgio->pg_lseg != NULL); + WARN_ON_ONCE(pgio->pg_lseg != NULL); if (req->wb_offset != req->wb_pgbase) { nfs_pageio_reset_write_mds(pgio); -- cgit v1.2.1 From deed85e760c8c88cd984c5921dd8cb6b697b6134 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 15 Oct 2012 15:02:01 -0400 Subject: NFS: Remove BUG_ON() calls from the generic writeback code ...and ensure that we set the return value for nfs_page_async_flush() to zero! (Reported-by: Dros Adamson) Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 9347ab7c9574..f5bc8e11713b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -239,21 +239,18 @@ int nfs_congestion_kb; #define NFS_CONGESTION_OFF_THRESH \ (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2)) -static int nfs_set_page_writeback(struct page *page) +static void nfs_set_page_writeback(struct page *page) { + struct nfs_server *nfss = NFS_SERVER(page_file_mapping(page)->host); int ret = test_set_page_writeback(page); - if (!ret) { - struct inode *inode = page_file_mapping(page)->host; - struct nfs_server *nfss = NFS_SERVER(inode); + WARN_ON_ONCE(ret != 0); - if (atomic_long_inc_return(&nfss->writeback) > - NFS_CONGESTION_ON_THRESH) { - set_bdi_congested(&nfss->backing_dev_info, - BLK_RW_ASYNC); - } + if (atomic_long_inc_return(&nfss->writeback) > + NFS_CONGESTION_ON_THRESH) { + set_bdi_congested(&nfss->backing_dev_info, + BLK_RW_ASYNC); } - return ret; } static void nfs_end_page_writeback(struct page *page) @@ -315,10 +312,10 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, if (IS_ERR(req)) goto out; - ret = nfs_set_page_writeback(page); - BUG_ON(ret != 0); - BUG_ON(test_bit(PG_CLEAN, &req->wb_flags)); + nfs_set_page_writeback(page); + WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); + ret = 0; if (!nfs_pageio_add_request(pgio, req)) { nfs_redirty_request(req); ret = pgio->pg_error; @@ -451,8 +448,6 @@ static void nfs_inode_remove_request(struct nfs_page *req) struct inode *inode = req->wb_context->dentry->d_inode; struct nfs_inode *nfsi = NFS_I(inode); - BUG_ON (!NFS_WBACK_BUSY(req)); - spin_lock(&inode->i_lock); if (likely(!PageSwapCache(req->wb_page))) { set_page_private(req->wb_page, 0); @@ -1727,7 +1722,6 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) struct nfs_page *req; int ret = 0; - BUG_ON(!PageLocked(page)); for (;;) { wait_on_page_writeback(page); req = nfs_page_find_request(page); -- cgit v1.2.1 From 4ea8fed593218b658927b763f02941cd16c2ed9d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 15 Oct 2012 15:47:41 -0400 Subject: NFSv4: Get rid of unnecessary BUG_ON()s Signed-off-by: Trond Myklebust --- fs/nfs/cache_lib.c | 1 - fs/nfs/callback_proc.c | 1 - fs/nfs/nfs4file.c | 1 - fs/nfs/nfs4proc.c | 14 ++++++-------- 
fs/nfs/nfs4state.c | 1 - 5 files changed, 6 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c index dded26368111..862a2f16db64 100644 --- a/fs/nfs/cache_lib.c +++ b/fs/nfs/cache_lib.c @@ -118,7 +118,6 @@ int nfs_cache_register_sb(struct super_block *sb, struct cache_detail *cd) struct dentry *dir; dir = rpc_d_lookup_sb(sb, "cache"); - BUG_ON(dir == NULL); ret = sunrpc_cache_register_pipefs(dir, cd->name, 0600, cd); dput(dir); return ret; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 76b4a7a3e559..0be08b964f38 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -216,7 +216,6 @@ static u32 initiate_bulk_draining(struct nfs_client *clp, } pnfs_get_layout_hdr(lo); spin_unlock(&ino->i_lock); - BUG_ON(!list_empty(&lo->plh_bulk_recall)); list_add(&lo->plh_bulk_recall, &recall_list); } } diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index afddd6639afb..e7699308364a 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -20,7 +20,6 @@ nfs4_file_open(struct inode *inode, struct file *filp) struct iattr attr; int err; - BUG_ON(inode != dentry->d_inode); /* * If no cached dentry exists or if it's negative, NFSv4 handled the * opens in ->lookup() or ->create(). diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 14d86ef493a0..6300cdd81101 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -206,7 +206,6 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent { __be32 *start, *p; - BUG_ON(readdir->count < 80); if (cookie > 2) { readdir->cookie = cookie; memcpy(&readdir->verifier, verifier, sizeof(readdir->verifier)); @@ -415,7 +414,6 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp static void nfs4_free_slot(struct nfs4_slot_table *tbl, u32 slotid) { - BUG_ON(slotid >= NFS4_MAX_SLOT_TABLE); /* clear used bit in bitmap */ __clear_bit(slotid, tbl->used_slots); @@ -2533,7 +2531,8 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, rpc_authflavor_t flav_array[NFS_MAX_SECFLAVORS]; len = rpcauth_list_flavors(flav_array, ARRAY_SIZE(flav_array)); - BUG_ON(len < 0); + if (len < 0) + return len; for (i = 0; i < len; i++) { /* AUTH_UNIX is the default flavor if none was specified, @@ -3362,9 +3361,6 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, int mode = sattr->ia_mode; int status = -ENOMEM; - BUG_ON(!(sattr->ia_valid & ATTR_MODE)); - BUG_ON(!S_ISFIFO(mode) && !S_ISBLK(mode) && !S_ISCHR(mode) && !S_ISSOCK(mode)); - data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4SOCK); if (data == NULL) goto out; @@ -3380,10 +3376,13 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, data->arg.ftype = NF4CHR; data->arg.u.device.specdata1 = MAJOR(rdev); data->arg.u.device.specdata2 = MINOR(rdev); + } else if (!S_ISSOCK(mode)) { + status = -EINVAL; + goto out_free; } status = nfs4_do_create(dir, dentry, data); - +out_free: nfs4_free_createdata(data); out: return status; @@ -5357,7 +5356,6 @@ int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred }; dprintk("--> %s\n", __func__); - BUG_ON(clp == NULL); res.session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS); if (unlikely(res.session == NULL)) { diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index c351e6b39838..e0a28dffd29d 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1086,7 +1086,6 @@ void nfs_free_seqid(struct nfs_seqid *seqid) */ static void nfs_increment_seqid(int 
status, struct nfs_seqid *seqid) { - BUG_ON(list_first_entry(&seqid->sequence->list, struct nfs_seqid, list) != seqid); switch (status) { case 0: break; -- cgit v1.2.1 From f48407ddd46bd215a7b4e1af3940e759a93640c5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 15 Oct 2012 16:19:30 -0400 Subject: NFS: Remove BUG_ON()s in the fs/nfs/inode.c Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 6fa01aea2488..117183b1ee09 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -112,8 +112,8 @@ void nfs_clear_inode(struct inode *inode) /* * The following should never happen... */ - BUG_ON(nfs_have_writebacks(inode)); - BUG_ON(!list_empty(&NFS_I(inode)->open_files)); + WARN_ON_ONCE(nfs_have_writebacks(inode)); + WARN_ON_ONCE(!list_empty(&NFS_I(inode)->open_files)); nfs_zap_acl_cache(inode); nfs_access_zap_cache(inode); nfs_fscache_release_inode_cookie(inode); -- cgit v1.2.1 From 28d79ea33f52cae1ea04808e1ec52b8657b5d804 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 15 Oct 2012 16:25:42 -0400 Subject: NFS: Remove the BUG_ON() in the mount code Signed-off-by: Trond Myklebust --- fs/nfs/mount_clnt.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 015f71f8f62c..91a6faf811ac 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -169,6 +169,9 @@ int nfs_mount(struct nfs_mount_request *info) (info->hostname ? info->hostname : "server"), info->dirpath); + if (strlen(info->dirpath) > MNTPATHLEN) + return -ENAMETOOLONG; + if (info->noresvport) args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; @@ -242,6 +245,9 @@ void nfs_umount(const struct nfs_mount_request *info) struct rpc_clnt *clnt; int status; + if (strlen(info->dirpath) > MNTPATHLEN) + return; + if (info->noresvport) args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; @@ -283,7 +289,6 @@ static void encode_mntdirpath(struct xdr_stream *xdr, const char *pathname) const u32 pathname_len = strlen(pathname); __be32 *p; - BUG_ON(pathname_len > MNTPATHLEN); p = xdr_reserve_space(xdr, 4 + pathname_len); xdr_encode_opaque(p, pathname, pathname_len); } -- cgit v1.2.1 From aad56de378b4c675e964a1ab44cf2e55d44d2865 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 15 Oct 2012 17:14:38 -0400 Subject: lockd: Remove unnecessary BUG_ON()s in the xdr client code - Offset bound checks are done in the NFS client code. 
- So are filehandle size checks - The cookie length is a constant - The utsname()->nodename is already bounded Signed-off-by: Trond Myklebust --- fs/lockd/clnt4xdr.c | 8 -------- fs/lockd/clntxdr.c | 8 -------- 2 files changed, 16 deletions(-) (limited to 'fs') diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c index 13ad1539fbf2..00ec0b9c94d1 100644 --- a/fs/lockd/clnt4xdr.c +++ b/fs/lockd/clnt4xdr.c @@ -64,10 +64,6 @@ static void nlm4_compute_offsets(const struct nlm_lock *lock, { const struct file_lock *fl = &lock->fl; - BUG_ON(fl->fl_start > NLM4_OFFSET_MAX); - BUG_ON(fl->fl_end > NLM4_OFFSET_MAX && - fl->fl_end != OFFSET_MAX); - *l_offset = loff_t_to_s64(fl->fl_start); if (fl->fl_end == OFFSET_MAX) *l_len = 0; @@ -122,7 +118,6 @@ static void encode_netobj(struct xdr_stream *xdr, { __be32 *p; - BUG_ON(length > XDR_MAX_NETOBJ); p = xdr_reserve_space(xdr, 4 + length); xdr_encode_opaque(p, data, length); } @@ -156,7 +151,6 @@ out_overflow: static void encode_cookie(struct xdr_stream *xdr, const struct nlm_cookie *cookie) { - BUG_ON(cookie->len > NLM_MAXCOOKIELEN); encode_netobj(xdr, (u8 *)&cookie->data, cookie->len); } @@ -198,7 +192,6 @@ out_overflow: */ static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh) { - BUG_ON(fh->size > NFS3_FHSIZE); encode_netobj(xdr, (u8 *)&fh->data, fh->size); } @@ -336,7 +329,6 @@ static void encode_caller_name(struct xdr_stream *xdr, const char *name) u32 length = strlen(name); __be32 *p; - BUG_ON(length > NLM_MAXSTRLEN); p = xdr_reserve_space(xdr, 4 + length); xdr_encode_opaque(p, name, length); } diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c index 982d2676e1f8..9a55797a1cd4 100644 --- a/fs/lockd/clntxdr.c +++ b/fs/lockd/clntxdr.c @@ -60,10 +60,6 @@ static void nlm_compute_offsets(const struct nlm_lock *lock, { const struct file_lock *fl = &lock->fl; - BUG_ON(fl->fl_start > NLM_OFFSET_MAX); - BUG_ON(fl->fl_end > NLM_OFFSET_MAX && - fl->fl_end != OFFSET_MAX); - *l_offset = loff_t_to_s32(fl->fl_start); if (fl->fl_end == OFFSET_MAX) *l_len = 0; @@ -119,7 +115,6 @@ static void encode_netobj(struct xdr_stream *xdr, { __be32 *p; - BUG_ON(length > XDR_MAX_NETOBJ); p = xdr_reserve_space(xdr, 4 + length); xdr_encode_opaque(p, data, length); } @@ -153,7 +148,6 @@ out_overflow: static void encode_cookie(struct xdr_stream *xdr, const struct nlm_cookie *cookie) { - BUG_ON(cookie->len > NLM_MAXCOOKIELEN); encode_netobj(xdr, (u8 *)&cookie->data, cookie->len); } @@ -195,7 +189,6 @@ out_overflow: */ static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh) { - BUG_ON(fh->size != NFS2_FHSIZE); encode_netobj(xdr, (u8 *)&fh->data, NFS2_FHSIZE); } @@ -330,7 +323,6 @@ static void encode_caller_name(struct xdr_stream *xdr, const char *name) u32 length = strlen(name); __be32 *p; - BUG_ON(length > NLM_MAXSTRLEN); p = xdr_reserve_space(xdr, 4 + length); xdr_encode_opaque(p, name, length); } -- cgit v1.2.1 From 326ce0a6da64df3eb8f13a623304ab8033d38c12 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 15 Oct 2012 17:21:04 -0400 Subject: lockd: Remove trivial BUG_ON()s from the NSM code Signed-off-by: Trond Myklebust --- fs/lockd/mon.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 3d7e09bcc0e9..3c2cfc683631 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -154,8 +154,6 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res, .rpc_resp = res, }; - BUG_ON(clnt == NULL); - memset(res, 0, sizeof(*res)); msg.rpc_proc = &clnt->cl_procinfo[proc]; @@ 
-466,7 +464,6 @@ static void encode_nsm_string(struct xdr_stream *xdr, const char *string) const u32 len = strlen(string); __be32 *p; - BUG_ON(len > SM_MAXSTRLEN); p = xdr_reserve_space(xdr, 4 + len); xdr_encode_opaque(p, string, len); } -- cgit v1.2.1 From a2d30a54df968c01fff4a412ac23f55832f45fe6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 15 Oct 2012 17:26:20 -0400 Subject: lockd: Remove BUG_ON()s in fs/lockd/host.c - Convert the non-trivial ones into WARN_ON_ONCE(). - Remove the trivial refcounting BUGs Signed-off-by: Trond Myklebust --- fs/lockd/host.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/lockd/host.c b/fs/lockd/host.c index f9b22e58f78f..0e17090c310f 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -177,9 +177,6 @@ static void nlm_destroy_host_locked(struct nlm_host *host) dprintk("lockd: destroy host %s\n", host->h_name); - BUG_ON(!list_empty(&host->h_lockowners)); - BUG_ON(atomic_read(&host->h_count)); - hlist_del_init(&host->h_hash); nsm_unmonitor(host); @@ -289,13 +286,12 @@ void nlmclnt_release_host(struct nlm_host *host) dprintk("lockd: release client host %s\n", host->h_name); - BUG_ON(atomic_read(&host->h_count) < 0); - BUG_ON(host->h_server); + WARN_ON_ONCE(host->h_server); if (atomic_dec_and_test(&host->h_count)) { - BUG_ON(!list_empty(&host->h_lockowners)); - BUG_ON(!list_empty(&host->h_granted)); - BUG_ON(!list_empty(&host->h_reclaim)); + WARN_ON_ONCE(!list_empty(&host->h_lockowners)); + WARN_ON_ONCE(!list_empty(&host->h_granted)); + WARN_ON_ONCE(!list_empty(&host->h_reclaim)); mutex_lock(&nlm_host_mutex); nlm_destroy_host_locked(host); @@ -412,8 +408,7 @@ void nlmsvc_release_host(struct nlm_host *host) dprintk("lockd: release server host %s\n", host->h_name); - BUG_ON(atomic_read(&host->h_count) < 0); - BUG_ON(!host->h_server); + WARN_ON_ONCE(!host->h_server); atomic_dec(&host->h_count); } -- cgit v1.2.1 From 262693482cd56f887174ad1c0c2bb4f94ffad0ee Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 15 Oct 2012 17:28:45 -0400 Subject: lockd: Remove BUG_ON()s from fs/lockd/clntproc.c Signed-off-by: Trond Myklebust --- fs/lockd/clntproc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 05d29124c6ab..54f9e6ce0430 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -141,7 +141,7 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl) static void nlmclnt_release_lockargs(struct nlm_rqst *req) { - BUG_ON(req->a_args.lock.fl.fl_ops != NULL); + WARN_ON_ONCE(req->a_args.lock.fl.fl_ops != NULL); } /** @@ -465,7 +465,6 @@ static const struct file_lock_operations nlmclnt_lock_ops = { static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host) { - BUG_ON(fl->fl_ops != NULL); fl->fl_u.nfs_fl.state = 0; fl->fl_u.nfs_fl.owner = nlm_find_lockowner(host, fl->fl_owner); INIT_LIST_HEAD(&fl->fl_u.nfs_fl.list); -- cgit v1.2.1 From 3798f47aa276b332c30da499cb4df4577e2f8872 Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Mon, 5 Nov 2012 11:39:32 +0000 Subject: cifs: Do not lookup hashed negative dentry in cifs_atomic_open We do not need to lookup a hashed negative directory since we have already revalidated it before and have found it to be fine. This also prevents a crash in cifs_lookup() when it attempts to rehash the already hashed negative lookup dentry. 
The patch has been tested using the reproducer at https://bugzilla.redhat.com/show_bug.cgi?id=867344#c28 Cc: # 3.6.x Reported-by: Vit Zahradka Signed-off-by: Sachin Prabhu --- fs/cifs/dir.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 7c0a81283645..d3671f2acb29 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -398,7 +398,16 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, * in network traffic in the other paths. */ if (!(oflags & O_CREAT)) { - struct dentry *res = cifs_lookup(inode, direntry, 0); + struct dentry *res; + + /* + * Check for hashed negative dentry. We have already revalidated + * the dentry and it is fine. No need to perform another lookup. + */ + if (!d_unhashed(direntry)) + return -ENOENT; + + res = cifs_lookup(inode, direntry, 0); if (IS_ERR(res)) return PTR_ERR(res); -- cgit v1.2.1 From 22cddde104d715600a4c218bf9224923208afe90 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 5 Nov 2012 11:07:23 -0800 Subject: ceph: Fix i_size update race ceph_aio_write() has an optimization that marks the CEPH_CAP_FILE_WR cap dirty before data is copied to the page cache and the inode size is updated. If ceph_check_caps() flushes the dirty cap before the inode size is updated, the MDS can miss the new inode size. The fix is to move ceph_{get,put}_cap_refs() into ceph_write_{begin,end}() and call __ceph_mark_dirty_caps() after the inode size is updated. Signed-off-by: Yan, Zheng Signed-off-by: Sage Weil --- fs/ceph/addr.c | 51 ++++++++++++++++++++++++++++++++++++---- fs/ceph/file.c | 73 ++++++++++++++++++++++++---------------------------------- 2 files changed, 77 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 22b6e4583faa..21a07187df05 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1078,23 +1078,51 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_file_info *fi = file->private_data; struct page *page; pgoff_t index = pos >> PAGE_CACHE_SHIFT; - int r; + int r, want, got = 0; + + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_BUFFER; + + dout("write_begin %p %llx.%llx %llu~%u getting caps. 
i_size %llu\n", + inode, ceph_vinop(inode), pos, len, inode->i_size); + r = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos+len); + if (r < 0) + return r; + dout("write_begin %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), pos, len, ceph_cap_string(got)); + if (!(got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO))) { + ceph_put_cap_refs(ci, got); + return -EAGAIN; + } do { /* get a page */ page = grab_cache_page_write_begin(mapping, index, 0); - if (!page) - return -ENOMEM; - *pagep = page; + if (!page) { + r = -ENOMEM; + break; + } dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len); r = ceph_update_writeable_page(file, pos, len, page); + if (r) + page_cache_release(page); } while (r == -EAGAIN); + if (r) { + ceph_put_cap_refs(ci, got); + } else { + *pagep = page; + *(int *)fsdata = got; + } return r; } @@ -1108,10 +1136,12 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_mds_client *mdsc = fsc->mdsc; unsigned from = pos & (PAGE_CACHE_SIZE - 1); int check_cap = 0; + int got = (unsigned long)fsdata; dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, inode, page, (int)pos, (int)copied, (int)len); @@ -1134,6 +1164,19 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, up_read(&mdsc->snap_rwsem); page_cache_release(page); + if (copied > 0) { + int dirty; + spin_lock(&ci->i_ceph_lock); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + } + + dout("write_end %p %llx.%llx %llu~%u dropping cap refs on %s\n", + inode, ceph_vinop(inode), pos, len, ceph_cap_string(got)); + ceph_put_cap_refs(ci, got); + if (check_cap) ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 5840d2aaed15..d415096800a6 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -712,63 +712,53 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->client->osdc; loff_t endoff = pos + iov->iov_len; - int want, got = 0; - int ret, err; + int got = 0; + int ret, err, written; if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; retry_snap: + written = 0; if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) return -ENOSPC; __ceph_do_pending_vmtruncate(inode); - dout("aio_write %p %llx.%llx %llu~%u getting caps. 
i_size %llu\n", - inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, - inode->i_size); - if (fi->fmode & CEPH_FILE_MODE_LAZY) - want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; - else - want = CEPH_CAP_FILE_BUFFER; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); - if (ret < 0) - goto out_put; - - dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n", - inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, - ceph_cap_string(got)); - - if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || - (iocb->ki_filp->f_flags & O_DIRECT) || - (inode->i_sb->s_flags & MS_SYNCHRONOUS) || - (fi->flags & CEPH_F_SYNC)) { - ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, - &iocb->ki_pos); - } else { - /* - * buffered write; drop Fw early to avoid slow - * revocation if we get stuck on balance_dirty_pages - */ - int dirty; - - spin_lock(&ci->i_ceph_lock); - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); - spin_unlock(&ci->i_ceph_lock); - ceph_put_cap_refs(ci, got); + /* + * try to do a buffered write. if we don't have sufficient + * caps, we'll get -EAGAIN from generic_file_aio_write, or a + * short write if we only get caps for some pages. + */ + if (!(iocb->ki_filp->f_flags & O_DIRECT) && + !(inode->i_sb->s_flags & MS_SYNCHRONOUS) && + !(fi->flags & CEPH_F_SYNC)) { ret = generic_file_aio_write(iocb, iov, nr_segs, pos); + if (ret >= 0) + written = ret; + if ((ret >= 0 || ret == -EIOCBQUEUED) && ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { - err = vfs_fsync_range(file, pos, pos + ret - 1, 1); + err = vfs_fsync_range(file, pos, pos + written - 1, 1); if (err < 0) ret = err; } + if ((ret < 0 && ret != -EAGAIN) || pos + written >= endoff) + goto out; + } - if (dirty) - __mark_inode_dirty(inode, dirty); + dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", + inode, ceph_vinop(inode), pos + written, + (unsigned)iov->iov_len - written, inode->i_size); + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, 0, &got, endoff); + if (ret < 0) goto out; - } + dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), pos + written, + (unsigned)iov->iov_len - written, ceph_cap_string(got)); + ret = ceph_sync_write(file, iov->iov_base + written, + iov->iov_len - written, &iocb->ki_pos); if (ret >= 0) { int dirty; spin_lock(&ci->i_ceph_lock); @@ -777,13 +767,10 @@ retry_snap: if (dirty) __mark_inode_dirty(inode, dirty); } - -out_put: dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", - inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, - ceph_cap_string(got)); + inode, ceph_vinop(inode), pos + written, + (unsigned)iov->iov_len - written, ceph_cap_string(got)); ceph_put_cap_refs(ci, got); - out: if (ret == -EOLDSNAPC) { dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", -- cgit v1.2.1 From aaaf68c5629108f6078ab458d34a661143ea6857 Mon Sep 17 00:00:00 2001 From: Andrew Price Date: Fri, 12 Oct 2012 16:45:08 +0100 Subject: GFS2: Fix an unchecked error from gfs2_rs_alloc Check the return value of gfs2_rs_alloc(ip) and avoid a possible null pointer dereference. 
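To spell out the failure mode being closed here, a sketch of the pre-patch flow (condensed from the diff below):

	/* Sketch: the unchecked call left ip->i_res NULL on allocation
	 * failure, and the next statement dereferenced it. */
	if (ip->i_res == NULL)
		gfs2_rs_alloc(ip);	/* may fail; i_res stays NULL */
	qd = ip->i_res->rs_qa_qd;	/* NULL pointer dereference    */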
Signed-off-by: Andrew Price Signed-off-by: Steven Whitehouse --- fs/gfs2/quota.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 40c4b0d42fa8..c5af8e18f27a 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -497,8 +497,11 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid) struct gfs2_quota_data **qd; int error; - if (ip->i_res == NULL) - gfs2_rs_alloc(ip); + if (ip->i_res == NULL) { + error = gfs2_rs_alloc(ip); + if (error) + return error; + } qd = ip->i_res->rs_qa_qd; -- cgit v1.2.1 From cd0ed19fb614cb1315c0a510ec6c163d8324fd82 Mon Sep 17 00:00:00 2001 From: Andrew Price Date: Fri, 12 Oct 2012 16:45:09 +0100 Subject: GFS2: Fix possible null pointer deref in gfs2_rs_alloc Despite the return value from kmem_cache_zalloc() being checked, the error wasn't being returned until after a possible null pointer dereference. This patch returns the error immediately, allowing the removal of the error variable. Signed-off-by: Andrew Price Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 3cc402ce6fea..43d1a20bdbe4 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -553,7 +553,6 @@ void gfs2_free_clones(struct gfs2_rgrpd *rgd) */ int gfs2_rs_alloc(struct gfs2_inode *ip) { - int error = 0; struct gfs2_blkreserv *res; if (ip->i_res) @@ -561,7 +560,7 @@ int gfs2_rs_alloc(struct gfs2_inode *ip) res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS); if (!res) - error = -ENOMEM; + return -ENOMEM; RB_CLEAR_NODE(&res->rs_node); @@ -571,7 +570,7 @@ int gfs2_rs_alloc(struct gfs2_inode *ip) else ip->i_res = res; up_write(&ip->i_rw_mutex); - return error; + return 0; } static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs) -- cgit v1.2.1 From 73738a77f42c2d7f53fd61f73272c9dd6f520897 Mon Sep 17 00:00:00 2001 From: Andrew Price Date: Fri, 12 Oct 2012 16:45:10 +0100 Subject: GFS2: Clean up some unused assignments Cleans up two cases where variables were assigned values but then never used again. 
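The pre-patch shape, sketched for clarity (condensed from the diff below):

	res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS);
	if (!res)
		error = -ENOMEM;	/* failure noted, but not returned... */

	RB_CLEAR_NODE(&res->rs_node);	/* ...so a NULL res is dereferenced */

Returning -ENOMEM at the check, as below, removes both the window and the error variable.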
Signed-off-by: Andrew Price Signed-off-by: Steven Whitehouse --- fs/gfs2/file.c | 2 -- fs/gfs2/lops.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 0def0504afc1..377a68dbd066 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -677,10 +677,8 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, size_t writesize = iov_length(iov, nr_segs); struct dentry *dentry = file->f_dentry; struct gfs2_inode *ip = GFS2_I(dentry->d_inode); - struct gfs2_sbd *sdp; int ret; - sdp = GFS2_SB(file->f_mapping->host); ret = gfs2_rs_alloc(ip); if (ret) return ret; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 8ff95a2d54ee..01e444b5b2bd 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -621,7 +621,6 @@ static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) static void revoke_lo_before_commit(struct gfs2_sbd *sdp) { - struct gfs2_log_descriptor *ld; struct gfs2_meta_header *mh; unsigned int offset; struct list_head *head = &sdp->sd_log_le_revoke; @@ -634,7 +633,6 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp) length = gfs2_struct2blk(sdp, sdp->sd_log_num_revoke, sizeof(u64)); page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_REVOKE, length, sdp->sd_log_num_revoke); - ld = page_address(page); offset = sizeof(struct gfs2_log_descriptor); list_for_each_entry(bd, head, bd_list) { -- cgit v1.2.1 From 3a238adefb8c5b8cb8cde0ce689d513306176ff4 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 16 Oct 2012 11:39:07 +0200 Subject: GFS2: Require user to provide argument for FITRIM When the fstrim_range argument is not provided by the user in the FITRIM ioctl, we should just return -EFAULT and not promote bad behaviour by filling in the structure in the kernel. Let the user deal with it. Signed-off-by: Lukas Czerner Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 43d1a20bdbe4..b6bbf718d6c3 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1270,11 +1270,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp) if (!blk_queue_discard(q)) return -EOPNOTSUPP; - if (argp == NULL) { - r.start = 0; - r.len = ULLONG_MAX; - r.minlen = 0; - } else if (copy_from_user(&r, argp, sizeof(r))) + if (copy_from_user(&r, argp, sizeof(r))) return -EFAULT; ret = gfs2_rindex_update(sdp); @@ -1323,7 +1319,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp) out: r.len = trimmed << 9; - if (argp && copy_to_user(argp, &r, sizeof(r))) + if (copy_to_user(argp, &r, sizeof(r))) return -EFAULT; return ret; -- cgit v1.2.1 From 076f0faa764ab3a5a32fc726ae05e2de0e66151d Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 16 Oct 2012 11:39:08 +0200 Subject: GFS2: Fix FITRIM argument handling The current implementation in gfs2 treats the FITRIM arguments as if they were in file system block units, which is wrong. The FITRIM arguments (fstrim_range.start, fstrim_range.len and fstrim_range.minlen) are actually in bytes. Moreover, checks for the start argument lying beyond the end of the file system, the len argument being smaller than the file system block size, and the minlen argument being bigger than the biggest resource group were missing. This commit changes the code to convert the FITRIM arguments to file system blocks and also adds the appropriate checks mentioned above. All the problems were recognised by xfstests 251 and 260.
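To make the unit conversion concrete, a worked example assuming a 4KiB block size, i.e. sb_bsize_shift == 12 (the numbers are illustrative; the variable names follow the patch below):

	/* Sketch: FITRIM arguments arrive in bytes and must be converted
	 * to file system blocks before use. */
	start = r.start >> bs_shift;		/* 8192 bytes -> block 2    */
	end = start + (r.len >> bs_shift);	/* 1GiB -> 262144 blocks    */
	minlen = max_t(u64, r.minlen,
		       q->limits.discard_granularity) >> bs_shift;
	if (end <= start || minlen > sdp->sd_max_rg_data)
		return -EINVAL;			/* reject nonsense ranges   */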
Signed-off-by: Lukas Czerner Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index b6bbf718d6c3..38fe18f2f055 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1262,7 +1262,9 @@ int gfs2_fitrim(struct file *filp, void __user *argp) int ret = 0; u64 amt; u64 trimmed = 0; + u64 start, end, minlen; unsigned int x; + unsigned bs_shift = sdp->sd_sb.sb_bsize_shift; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1277,8 +1279,18 @@ int gfs2_fitrim(struct file *filp, void __user *argp) if (ret) return ret; - rgd = gfs2_blk2rgrpd(sdp, r.start, 0); - rgd_end = gfs2_blk2rgrpd(sdp, r.start + r.len, 0); + start = r.start >> bs_shift; + end = start + (r.len >> bs_shift); + minlen = max_t(u64, r.minlen, + q->limits.discard_granularity) >> bs_shift; + + rgd = gfs2_blk2rgrpd(sdp, start, 0); + rgd_end = gfs2_blk2rgrpd(sdp, end - 1, 0); + + if (end <= start || + minlen > sdp->sd_max_rg_data || + start > rgd_end->rd_data0 + rgd_end->rd_data) + return -EINVAL; while (1) { @@ -1290,7 +1302,9 @@ int gfs2_fitrim(struct file *filp, void __user *argp) /* Trim each bitmap in the rgrp */ for (x = 0; x < rgd->rd_length; x++) { struct gfs2_bitmap *bi = rgd->rd_bits + x; - ret = gfs2_rgrp_send_discards(sdp, rgd->rd_data0, NULL, bi, r.minlen, &amt); + ret = gfs2_rgrp_send_discards(sdp, + rgd->rd_data0, NULL, bi, minlen, + &amt); if (ret) { gfs2_glock_dq_uninit(&gh); goto out; -- cgit v1.2.1 From 3d1626889a64bd5a661544d582036a0a02104a60 Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Tue, 6 Nov 2012 00:49:28 -0600 Subject: GFS2: Don't call file_accessed() with a shared glock file_accessed() was being called by gfs2_mmap() with a shared glock. If it needed to update the atime, it was crashing because it dirtied the inode in gfs2_dirty_inode() without holding an exclusive lock. gfs2_dirty_inode() checked if the caller was already holding a glock, but it didn't make sure that the glock was in the exclusive state. Now, instead of calling file_accessed() while holding the shared lock in gfs2_mmap(), file_accessed() is called after grabbing and releasing the glock to update the inode. If file_accessed() needs to update the atime, it will grab an exclusive lock in gfs2_dirty_inode(). gfs2_dirty_inode() now also checks to make sure that if the calling process has already locked the glock, it has an exclusive lock. 
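The essence of the mmap-side reordering, sketched (condensed from the diff below):

	/* Sketch: drop the shared glock before touching atime, so that
	 * gfs2_dirty_inode() is free to take an exclusive glock. */
	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
	if (error)
		return error;
	gfs2_glock_dq_uninit(&i_gh);	/* release the SH lock first...       */
	file_accessed(file);		/* ...then update atime (may need EX) */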
Signed-off-by: Benjamin Marzinski Signed-off-by: Steven Whitehouse --- fs/gfs2/file.c | 12 +++++------- fs/gfs2/super.c | 3 ++- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 377a68dbd066..e056b4ce4877 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -516,15 +516,13 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) struct gfs2_holder i_gh; int error; - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); - error = gfs2_glock_nq(&i_gh); - if (error == 0) { - file_accessed(file); - gfs2_glock_dq(&i_gh); - } - gfs2_holder_uninit(&i_gh); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, + &i_gh); if (error) return error; + /* grab lock to update inode */ + gfs2_glock_dq_uninit(&i_gh); + file_accessed(file); } vma->vm_ops = &gfs2_vm_ops; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index bc737261f234..d6488674d916 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -810,7 +810,8 @@ static void gfs2_dirty_inode(struct inode *inode, int flags) return; } need_unlock = 1; - } + } else if (WARN_ON_ONCE(ip->i_gl->gl_state != LM_ST_EXCLUSIVE)) + return; if (current->journal_info == NULL) { ret = gfs2_trans_begin(sdp, RES_DINODE, 0); -- cgit v1.2.1 From 96e5d1d3adf56f1c7eeb07258f6a1a0a7ae9c489 Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Wed, 7 Nov 2012 00:38:06 -0600 Subject: GFS2: Test bufdata with buffer locked and gfs2_log_lock held In gfs2_trans_add_bh(), gfs2 was testing whether there was a bd attached to the buffer without having the gfs2_log_lock held. It was then assuming it would stay attached for the rest of the function. However, without either the log lock being held or the buffer locked, __gfs2_ail_flush() could detach bd at any time. This patch moves the locking before the test. If there isn't a bd already attached, gfs2 can safely allocate one and attach it before locking. There is no way that the newly allocated bd could be on the ail list, and thus no way for __gfs2_ail_flush() to detach it.
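The race class, reduced to its skeleton (condensed from the diff below):

	/* Sketch: the test and the use of bh->b_private must sit under the
	 * same locks that __gfs2_ail_flush() takes when detaching a bd. */
	lock_buffer(bh);
	gfs2_log_lock(sdp);
	bd = bh->b_private;	/* read under both locks: cannot be detached */
	if (bd)
		gfs2_assert(sdp, bd->bd_gl == gl);
	gfs2_log_unlock(sdp);
	unlock_buffer(bh);

Testing bh->b_private before taking the locks, as the old code did, left a window between the check and the use.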
Signed-off-by: Benjamin Marzinski Signed-off-by: Steven Whitehouse --- fs/gfs2/lops.c | 14 ++------------ fs/gfs2/trans.c | 8 ++++++++ 2 files changed, 10 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 01e444b5b2bd..9ceccb1595a3 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -393,12 +393,10 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) struct gfs2_meta_header *mh; struct gfs2_trans *tr; - lock_buffer(bd->bd_bh); - gfs2_log_lock(sdp); tr = current->journal_info; tr->tr_touched = 1; if (!list_empty(&bd->bd_list)) - goto out; + return; set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; @@ -414,9 +412,6 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) sdp->sd_log_num_buf++; list_add(&bd->bd_list, &sdp->sd_log_le_buf); tr->tr_num_buf_new++; -out: - gfs2_log_unlock(sdp); - unlock_buffer(bd->bd_bh); } static void gfs2_check_magic(struct buffer_head *bh) @@ -775,12 +770,10 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) struct address_space *mapping = bd->bd_bh->b_page->mapping; struct gfs2_inode *ip = GFS2_I(mapping->host); - lock_buffer(bd->bd_bh); - gfs2_log_lock(sdp); if (tr) tr->tr_touched = 1; if (!list_empty(&bd->bd_list)) - goto out; + return; set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); if (gfs2_is_jdata(ip)) { @@ -791,9 +784,6 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) } else { list_add_tail(&bd->bd_list, &sdp->sd_log_le_ordered); } -out: - gfs2_log_unlock(sdp); - unlock_buffer(bd->bd_bh); } /** diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index adbd27875ef9..413627072f36 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -155,14 +155,22 @@ void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta) struct gfs2_sbd *sdp = gl->gl_sbd; struct gfs2_bufdata *bd; + lock_buffer(bh); + gfs2_log_lock(sdp); bd = bh->b_private; if (bd) gfs2_assert(sdp, bd->bd_gl == gl); else { + gfs2_log_unlock(sdp); + unlock_buffer(bh); gfs2_attach_bufdata(gl, bh, meta); bd = bh->b_private; + lock_buffer(bh); + gfs2_log_lock(sdp); } lops_add(sdp, bd); + gfs2_log_unlock(sdp); + unlock_buffer(bh); } void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) -- cgit v1.2.1 From 8eae1ca0034cce78a24738087a32adb1ddb66aa7 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 15 Oct 2012 10:57:02 +0100 Subject: GFS2: Review bug traps in glops.c Two of the bug traps here could really be warnings. The others are converted from BUG() to GLOCK_BUG_ON() since we'll most likely need to know the glock state in order to debug any issues which arise. As a result of this, __dump_glock has to be renamed and is no longer static. 
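The pattern at work: when an assertion really must be fatal, capture the object's state first so the resulting oops is debuggable. The macro from the diff below, restated on its own:

	#define GLOCK_BUG_ON(gl,x) do { \
		if (unlikely(x)) { \
			gfs2_dump_glock(NULL, gl); /* dump state first */ \
			BUG(); \
		} } while(0)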
Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 10 ++++------ fs/gfs2/glock.h | 54 +++++++++++++++++++++++++++--------------------------- fs/gfs2/glops.c | 10 +++++----- 3 files changed, 36 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index e6c2fd53cab2..e543871ec82f 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -55,8 +55,6 @@ struct gfs2_glock_iter { typedef void (*glock_examiner) (struct gfs2_glock * gl); -static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); -#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0) static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); static struct dentry *gfs2_root; @@ -1013,7 +1011,7 @@ trap_recursive: printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid)); printk(KERN_ERR "lock type: %d req lock state : %d\n", gh->gh_gl->gl_name.ln_type, gh->gh_state); - __dump_glock(NULL, gl); + gfs2_dump_glock(NULL, gl); BUG(); } @@ -1508,7 +1506,7 @@ static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl) { int ret; spin_lock(&gl->gl_spin); - ret = __dump_glock(seq, gl); + ret = gfs2_dump_glock(seq, gl); spin_unlock(&gl->gl_spin); return ret; } @@ -1655,7 +1653,7 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl) } /** - * __dump_glock - print information about a glock + * gfs2_dump_glock - print information about a glock * @seq: The seq_file struct * @gl: the glock * @@ -1672,7 +1670,7 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl) * Returns: 0 on success, -ENOBUFS when we run out of space */ -static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) +int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) { const struct gfs2_glock_operations *glops = gl->gl_ops; unsigned long long dtime; diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 307ac31df781..fd580b7861d5 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -178,33 +178,33 @@ static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl) return NULL; } -int gfs2_glock_get(struct gfs2_sbd *sdp, - u64 number, const struct gfs2_glock_operations *glops, - int create, struct gfs2_glock **glp); -void gfs2_glock_hold(struct gfs2_glock *gl); -void gfs2_glock_put_nolock(struct gfs2_glock *gl); -void gfs2_glock_put(struct gfs2_glock *gl); -void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, - struct gfs2_holder *gh); -void gfs2_holder_reinit(unsigned int state, unsigned flags, - struct gfs2_holder *gh); -void gfs2_holder_uninit(struct gfs2_holder *gh); -int gfs2_glock_nq(struct gfs2_holder *gh); -int gfs2_glock_poll(struct gfs2_holder *gh); -int gfs2_glock_wait(struct gfs2_holder *gh); -void gfs2_glock_dq(struct gfs2_holder *gh); -void gfs2_glock_dq_wait(struct gfs2_holder *gh); - -void gfs2_glock_dq_uninit(struct gfs2_holder *gh); -int gfs2_glock_nq_num(struct gfs2_sbd *sdp, - u64 number, const struct gfs2_glock_operations *glops, - unsigned int state, int flags, struct gfs2_holder *gh); - -int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); -void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); -void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); - -__printf(2, 3) +extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, + const struct gfs2_glock_operations *glops, + int create, struct gfs2_glock **glp); +extern void gfs2_glock_hold(struct gfs2_glock *gl); +extern 
void gfs2_glock_put_nolock(struct gfs2_glock *gl); +extern void gfs2_glock_put(struct gfs2_glock *gl); +extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, + unsigned flags, struct gfs2_holder *gh); +extern void gfs2_holder_reinit(unsigned int state, unsigned flags, + struct gfs2_holder *gh); +extern void gfs2_holder_uninit(struct gfs2_holder *gh); +extern int gfs2_glock_nq(struct gfs2_holder *gh); +extern int gfs2_glock_poll(struct gfs2_holder *gh); +extern int gfs2_glock_wait(struct gfs2_holder *gh); +extern void gfs2_glock_dq(struct gfs2_holder *gh); +extern void gfs2_glock_dq_wait(struct gfs2_holder *gh); +extern void gfs2_glock_dq_uninit(struct gfs2_holder *gh); +extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, + const struct gfs2_glock_operations *glops, + unsigned int state, int flags, + struct gfs2_holder *gh); +extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); +extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); +extern void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); +extern int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); +#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0) +extern __printf(2, 3) void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); /** diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 32cc4fde975c..0a3e7c7e26c1 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -74,7 +74,7 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) gfs2_trans_add_revoke(sdp, bd); } - BUG_ON(!fsync && atomic_read(&gl->gl_ail_count)); + GLOCK_BUG_ON(gl, !fsync && atomic_read(&gl->gl_ail_count)); spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); } @@ -96,7 +96,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) tr.tr_ip = (unsigned long)__builtin_return_address(0); sb_start_intwrite(sdp->sd_vfs); gfs2_log_reserve(sdp, tr.tr_reserved); - BUG_ON(current->journal_info); + WARN_ON_ONCE(current->journal_info); current->journal_info = &tr; __gfs2_ail_flush(gl, 0); @@ -139,7 +139,7 @@ static void rgrp_go_sync(struct gfs2_glock *gl) if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) return; - BUG_ON(gl->gl_state != LM_ST_EXCLUSIVE); + GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); gfs2_log_flush(gl->gl_sbd, gl); filemap_fdatawrite(metamapping); @@ -168,7 +168,7 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags) { struct address_space *mapping = gfs2_glock2aspace(gl); - BUG_ON(!(flags & DIO_METADATA)); + WARN_ON_ONCE(!(flags & DIO_METADATA)); gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); truncate_inode_pages(mapping, 0); @@ -197,7 +197,7 @@ static void inode_go_sync(struct gfs2_glock *gl) if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) return; - BUG_ON(gl->gl_state != LM_ST_EXCLUSIVE); + GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); gfs2_log_flush(gl->gl_sbd, gl); filemap_fdatawrite(metamapping); -- cgit v1.2.1 From a68a0a352a0209467268dfddffe02db08b97ddb4 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Fri, 19 Oct 2012 08:32:51 -0400 Subject: GFS2: Speed up gfs2_rbm_from_block This patch is a rewrite of function gfs2_rbm_from_block. Rather than looping to find the right bitmap, the code now does a few simple math calculations. I compared the performance of both algorithms side by side and the new algorithm is noticeably faster. 
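In outline, the new calculation replaces the per-bitmap loop with a single divide; a hedged sketch using descriptive placeholder names (the exact code is in the diff below):

	/* Sketch: map a block number, relative to the rgrp, straight to
	 * (bitmap index, bit offset) without scanning. */
	offset = rblock;
	if (offset < first_bitmap_bits)		/* bitmap 0 holds fewer bits */
		return 0;			/* (larger gfs2_rgrp header) */
	offset += header_delta_bits;		/* normalize bitmap 0's size */
	x = offset / blocks_per_bitmap;		/* which bitmap block        */
	offset -= x * blocks_per_bitmap;	/* bit within that bitmap    */
	bi += x;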
Sample instrumentation output from a "fast" machine: 5 million calls: millisec spent: Orig: 166 New: 113 5 million calls: millisec spent: Orig: 189 New: 114 In addition, I ran postmark (on a somewhat slower CPU) before and after the new algorithm was put in place and postmark showed a decent improvement: Before the new algorithm: ------------------------- Time: 645 seconds total 584 seconds of transactions (171 per second) Files: 150087 created (232 per second) Creation alone: 100000 files (2083 per second) Mixed with transactions: 50087 files (85 per second) 49995 read (85 per second) 49991 appended (85 per second) 150087 deleted (232 per second) Deletion alone: 100174 files (7705 per second) Mixed with transactions: 49913 files (85 per second) Data: 273.42 megabytes read (434.08 kilobytes per second) 852.13 megabytes written (1.32 megabytes per second) With the new algorithm: ----------------------- Time: 599 seconds total 530 seconds of transactions (188 per second) Files: 150087 created (250 per second) Creation alone: 100000 files (1886 per second) Mixed with transactions: 50087 files (94 per second) 49995 read (94 per second) 49991 appended (94 per second) 150087 deleted (250 per second) Deletion alone: 100174 files (6260 per second) Mixed with transactions: 49913 files (94 per second) Data: 273.42 megabytes read (467.42 kilobytes per second) 852.13 megabytes written (1.42 megabytes per second) Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 1 + fs/gfs2/ops_fstype.c | 3 +++ fs/gfs2/rgrp.c | 21 ++++++++++++--------- 3 files changed, 16 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 3d469d37345e..24bb0b857860 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -621,6 +621,7 @@ struct gfs2_sbd { u32 sd_hash_bsize_shift; u32 sd_hash_ptrs; /* Number of pointers in a hash block */ u32 sd_qc_per_block; + u32 sd_blocks_per_bitmap; u32 sd_max_dirres; /* Max blocks needed to add a directory entry */ u32 sd_max_height; /* Max height of a file's metadata tree */ u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1]; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index e443966c8106..0e3554edb8f2 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -278,6 +278,9 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent) sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(struct gfs2_quota_change); + sdp->sd_blocks_per_bitmap = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header)) + * GFS2_NBBY; /* not the rgrp bitmap, subsequent bitmaps only */ /* Compute maximum reservation required to add a entry to a directory */ diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 38fe18f2f055..669b89b95ccc 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -251,22 +251,25 @@ static u32 gfs2_bitfit(const u8 *buf, const unsigned int len, static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block) { u64 rblock = block - rbm->rgd->rd_data0; - u32 goal = (u32)rblock; - int x; + u32 x; if (WARN_ON_ONCE(rblock > UINT_MAX)) return -EINVAL; if (block >= rbm->rgd->rd_data0 + rbm->rgd->rd_data) return -E2BIG; - for (x = 0; x < rbm->rgd->rd_length; x++) { - rbm->bi = rbm->rgd->rd_bits + x; - if (goal < (rbm->bi->bi_start + rbm->bi->bi_len) * GFS2_NBBY) { - rbm->offset = goal - (rbm->bi->bi_start * GFS2_NBBY); - break; - } - } + rbm->bi = rbm->rgd->rd_bits; + rbm->offset = (u32)(rblock); + /* Check if the block is within the first block */ + if (rbm->offset < 
(rbm->bi->bi_start + rbm->bi->bi_len) * GFS2_NBBY) + return 0; + /* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */ + rbm->offset += (sizeof(struct gfs2_rgrp) - + sizeof(struct gfs2_meta_header)) * GFS2_NBBY; + x = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap; + rbm->offset -= x * rbm->rgd->rd_sbd->sd_blocks_per_bitmap; + rbm->bi += x; return 0; } -- cgit v1.2.1 From 06dfc30641370094ed522bf5949b2a326fe2741b Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 24 Oct 2012 14:41:05 -0400 Subject: GFS2: Rename glops go_xmote_th to go_sync [Editorial: This is a nit, but has been a minor irritation for a long time:] This patch renames the glops structure member go_xmote_th to go_sync. The functionality is unchanged; it's just for readability. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 4 ++-- fs/gfs2/glops.c | 6 +++--- fs/gfs2/incore.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index e543871ec82f..6114571a979a 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -535,8 +535,8 @@ __acquires(&gl->gl_spin) (lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB))) clear_bit(GLF_BLOCKING, &gl->gl_flags); spin_unlock(&gl->gl_spin); - if (glops->go_xmote_th) - glops->go_xmote_th(gl); + if (glops->go_sync) + glops->go_sync(gl); if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA); clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 0a3e7c7e26c1..e86fe26c12d2 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -536,7 +536,7 @@ const struct gfs2_glock_operations gfs2_meta_glops = { }; const struct gfs2_glock_operations gfs2_inode_glops = { - .go_xmote_th = inode_go_sync, + .go_sync = inode_go_sync, .go_inval = inode_go_inval, .go_demote_ok = inode_go_demote_ok, .go_lock = inode_go_lock, @@ -546,7 +546,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = { }; const struct gfs2_glock_operations gfs2_rgrp_glops = { - .go_xmote_th = rgrp_go_sync, + .go_sync = rgrp_go_sync, .go_inval = rgrp_go_inval, .go_lock = gfs2_rgrp_go_lock, .go_unlock = gfs2_rgrp_go_unlock, @@ -556,7 +556,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = { }; const struct gfs2_glock_operations gfs2_trans_glops = { - .go_xmote_th = trans_go_sync, + .go_sync = trans_go_sync, .go_xmote_bh = trans_go_xmote_bh, .go_demote_ok = trans_go_demote_ok, .go_type = LM_TYPE_NONDISK, diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 24bb0b857860..a46f03485936 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -205,7 +205,7 @@ struct lm_lockname { struct gfs2_glock_operations { - void (*go_xmote_th) (struct gfs2_glock *gl); + void (*go_sync) (struct gfs2_glock *gl); int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh); void (*go_inval) (struct gfs2_glock *gl, int flags); int (*go_demote_ok) (const struct gfs2_glock *gl); -- cgit v1.2.1 From bcd97c06308cbfa8b46e11762ea116300cdce772 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 31 Oct 2012 09:58:42 +0000 Subject: GFS2: Add test for resource group congestion status This patch uses information gathered by the recent glock statistics patch in order to derive a boolean verdict on the congestion status of a resource group. This is then used when making decisions on which resource group to choose during block allocation. 
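As a rough numeric illustration (values assumed for the example, not taken from the patch): if the smoothed round trip time for this rgrp's glock is l_srttb = 3000 while the cpu-local average across rgrp glocks is r_srttb = 1000, then srttb_diff = -2000 and its square is 4,000,000; the rgrp is treated as congested only if that square also exceeds the summed variances after doubling (doubled again when either side has fewer than eight samples, and again on the second pass through the rgrp list), so sparse or noisy data is less likely to force a move to another rgrp.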
The aim is to avoid resource groups which are heavily contended by other nodes, while still ensuring locality of access wherever possible. Once a reservation has been made in a particular resource group we continue to use that resource group until a new reservation is required. This should help to ensure that we do not change resource groups too often. Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 90 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 669b89b95ccc..bdf3e644baae 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1681,6 +1681,88 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip return; } +/** + * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested + * @rgd: The rgrp in question + * @loops: An indication of how picky we can be (0=very, 1=less so) + * + * This function uses the recently added glock statistics in order to + * figure out whether a particular resource group is suffering from + * contention from multiple nodes. This is done purely on the basis + * of timings, since this is the only data we have to work with and + * our aim here is to reject a resource group which is highly contended + * but (very important) not to do this too often in order to ensure that + * we do not land up introducing fragmentation by changing resource + * groups when not actually required. + * + * The calculation is fairly simple: we want to know whether the SRTTB + * (i.e. smoothed round trip time for blocking operations) to acquire + * the lock for this rgrp's glock is significantly greater than the + * time taken for resource groups on average. We introduce a margin in + * the form of the variable @var which is computed as the sum of the two + * respective variances, and multiplied by a factor depending on @loops + * and whether we have a lot of data to base the decision on. This is + * then tested against the square difference of the means in order to + * decide whether the result is statistically significant or not. 
+ * + * Returns: A boolean verdict on the congestion status + */ + +static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops) +{ + const struct gfs2_glock *gl = rgd->rd_gl; + const struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_lkstats *st; + s64 r_dcount, l_dcount; + s64 r_srttb, l_srttb; + s64 srttb_diff; + s64 sqr_diff; + s64 var; + + preempt_disable(); + st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP]; + r_srttb = st->stats[GFS2_LKS_SRTTB]; + r_dcount = st->stats[GFS2_LKS_DCOUNT]; + var = st->stats[GFS2_LKS_SRTTVARB] + + gl->gl_stats.stats[GFS2_LKS_SRTTVARB]; + preempt_enable(); + + l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB]; + l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT]; + + if ((l_dcount < 1) || (r_dcount < 1) || (r_srttb == 0)) + return false; + + srttb_diff = r_srttb - l_srttb; + sqr_diff = srttb_diff * srttb_diff; + + var *= 2; + if (l_dcount < 8 || r_dcount < 8) + var *= 2; + if (loops == 1) + var *= 2; + + return ((srttb_diff < 0) && (sqr_diff > var)); +} + +/** + * gfs2_rgrp_used_recently + * @rs: The block reservation with the rgrp to test + * @msecs: The time limit in milliseconds + * + * Returns: True if the rgrp glock has been used within the time limit + */ +static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs, + u64 msecs) +{ + u64 tdiff; + + tdiff = ktime_to_ns(ktime_sub(ktime_get_real(), + rs->rs_rbm.rgd->rd_gl->gl_dstamp)); + + return tdiff > (msecs * 1000 * 1000); +} + static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin) { struct gfs2_rgrpd *rgd = *pos; @@ -1707,7 +1789,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested) struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *begin = NULL; struct gfs2_blkreserv *rs = ip->i_res; - int error = 0, rg_locked, flags = LM_FLAG_TRY; + int error = 0, rg_locked, flags = 0; u64 last_unlinked = NO_BLOCK; int loops = 0; @@ -1731,13 +1813,18 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested) if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) { rg_locked = 0; + if (!gfs2_rs_active(rs) && (loops < 2) && + gfs2_rgrp_used_recently(rs, 1000) && + gfs2_rgrp_congested(rs->rs_rbm.rgd, loops)) + goto next_rgrp; error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl, LM_ST_EXCLUSIVE, flags, &rs->rs_rgd_gh); - if (error == GLR_TRYFAILED) - goto next_rgrp; if (unlikely(error)) return error; + if (!gfs2_rs_active(rs) && (loops < 2) && + gfs2_rgrp_congested(rs->rs_rbm.rgd, loops)) + goto skip_rgrp; if (sdp->sd_args.ar_rgrplvb) { error = update_rgrp_lvb(rs->rs_rbm.rgd); if (unlikely(error)) { @@ -1789,7 +1876,6 @@ next_rgrp: * then this checks for some less likely conditions before * trying again. */ - flags &= ~LM_FLAG_TRY; loops++; /* Check that fs hasn't grown if writing to rindex */ if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) { -- cgit v1.2.1 From c9aecf73717f55e41ac11682a50bef8594547025 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 31 Oct 2012 10:30:22 +0000 Subject: GFS2: Use proper allocation context for new inodes Rather than using the parent directory's allocation context, this patch allocates the new inode earlier in the process and then uses it to contain all the information required. As a result, we can now use the new inode's own allocation context to allocate it rather than having to use the parent directory's context. This gives us a lot more flexibility in where the inode is placed on disk. 
Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 171 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 92 insertions(+), 79 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 381893ceefa4..749b05a960ef 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -364,34 +364,34 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name, return 0; } -static void munge_mode_uid_gid(struct gfs2_inode *dip, umode_t *mode, - unsigned int *uid, unsigned int *gid) +static void munge_mode_uid_gid(const struct gfs2_inode *dip, + struct inode *inode) { if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir && (dip->i_inode.i_mode & S_ISUID) && dip->i_inode.i_uid) { - if (S_ISDIR(*mode)) - *mode |= S_ISUID; + if (S_ISDIR(inode->i_mode)) + inode->i_mode |= S_ISUID; else if (dip->i_inode.i_uid != current_fsuid()) - *mode &= ~07111; - *uid = dip->i_inode.i_uid; + inode->i_mode &= ~07111; + inode->i_uid = dip->i_inode.i_uid; } else - *uid = current_fsuid(); + inode->i_uid = current_fsuid(); if (dip->i_inode.i_mode & S_ISGID) { - if (S_ISDIR(*mode)) - *mode |= S_ISGID; - *gid = dip->i_inode.i_gid; + if (S_ISDIR(inode->i_mode)) + inode->i_mode |= S_ISGID; + inode->i_gid = dip->i_inode.i_gid; } else - *gid = current_fsgid(); + inode->i_gid = current_fsgid(); } -static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation) +static int alloc_dinode(struct gfs2_inode *ip) { - struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); int error; int dblocks = 1; - error = gfs2_inplace_reserve(dip, RES_DINODE); + error = gfs2_inplace_reserve(ip, RES_DINODE); if (error) goto out; @@ -399,12 +399,15 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation) if (error) goto out_ipreserv; - error = gfs2_alloc_blocks(dip, no_addr, &dblocks, 1, generation); + error = gfs2_alloc_blocks(ip, &ip->i_no_addr, &dblocks, 1, &ip->i_generation); + ip->i_no_formal_ino = ip->i_generation; + ip->i_inode.i_ino = ip->i_no_addr; + ip->i_goal = ip->i_no_addr; gfs2_trans_end(sdp); out_ipreserv: - gfs2_inplace_release(dip); + gfs2_inplace_release(ip); out: return error; } @@ -429,52 +432,42 @@ static void gfs2_init_dir(struct buffer_head *dibh, /** * init_dinode - Fill in a new dinode structure * @dip: The directory this inode is being created in - * @gl: The glock covering the new inode - * @inum: The inode number - * @mode: The file permissions - * @uid: The uid of the new inode - * @gid: The gid of the new inode - * @generation: The generation number of the new inode - * @dev: The device number (if a device node) + * @ip: The inode * @symname: The symlink destination (if a symlink) - * @size: The inode size (ignored for directories) * @bhp: The buffer head (returned to caller) * */ -static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, - const struct gfs2_inum_host *inum, umode_t mode, - unsigned int uid, unsigned int gid, - const u64 *generation, dev_t dev, const char *symname, - unsigned size, struct buffer_head **bhp) +static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, + const char *symname, struct buffer_head **bhp) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct gfs2_dinode *di; struct buffer_head *dibh; struct timespec tv = CURRENT_TIME; - dibh = gfs2_meta_new(gl, inum->no_addr); - gfs2_trans_add_bh(gl, dibh, 1); + dibh = gfs2_meta_new(ip->i_gl, ip->i_no_addr); + gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_metatype_set(dibh, GFS2_METATYPE_DI, 
GFS2_FORMAT_DI); gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); di = (struct gfs2_dinode *)dibh->b_data; - di->di_num.no_formal_ino = cpu_to_be64(inum->no_formal_ino); - di->di_num.no_addr = cpu_to_be64(inum->no_addr); - di->di_mode = cpu_to_be32(mode); - di->di_uid = cpu_to_be32(uid); - di->di_gid = cpu_to_be32(gid); + di->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); + di->di_num.no_addr = cpu_to_be64(ip->i_no_addr); + di->di_mode = cpu_to_be32(ip->i_inode.i_mode); + di->di_uid = cpu_to_be32(ip->i_inode.i_uid); + di->di_gid = cpu_to_be32(ip->i_inode.i_gid); di->di_nlink = 0; - di->di_size = cpu_to_be64(size); + di->di_size = cpu_to_be64(ip->i_inode.i_size); di->di_blocks = cpu_to_be64(1); di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(tv.tv_sec); - di->di_major = cpu_to_be32(MAJOR(dev)); - di->di_minor = cpu_to_be32(MINOR(dev)); - di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr); - di->di_generation = cpu_to_be64(*generation); + di->di_major = cpu_to_be32(MAJOR(ip->i_inode.i_rdev)); + di->di_minor = cpu_to_be32(MINOR(ip->i_inode.i_rdev)); + di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_no_addr); + di->di_generation = cpu_to_be64(ip->i_generation); di->di_flags = 0; di->__pad1 = 0; - di->di_payload_format = cpu_to_be32(S_ISDIR(mode) ? GFS2_FORMAT_DE : 0); + di->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) ? GFS2_FORMAT_DE : 0); di->di_height = 0; di->__pad2 = 0; di->__pad3 = 0; @@ -487,7 +480,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, di->di_ctime_nsec = cpu_to_be32(tv.tv_nsec); memset(&di->di_reserved, 0, sizeof(di->di_reserved)); - switch(mode & S_IFMT) { + switch(ip->i_inode.i_mode & S_IFMT) { case S_IFREG: if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) || gfs2_tune_get(sdp, gt_new_files_jdata)) @@ -502,7 +495,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, gfs2_init_dir(dibh, dip); break; case S_IFLNK: - memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname, size); + memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname, ip->i_inode.i_size); break; } @@ -511,25 +504,22 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, *bhp = dibh; } -static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, - umode_t mode, const struct gfs2_inum_host *inum, - const u64 *generation, dev_t dev, const char *symname, - unsigned int size, struct buffer_head **bhp) +static int make_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, + const char *symname, struct buffer_head **bhp) { + struct inode *inode = &ip->i_inode; struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - unsigned int uid, gid; int error; - munge_mode_uid_gid(dip, &mode, &uid, &gid); error = gfs2_rindex_update(sdp); if (error) return error; - error = gfs2_quota_lock(dip, uid, gid); + error = gfs2_quota_lock(dip, inode->i_uid, inode->i_gid); if (error) return error; - error = gfs2_quota_check(dip, uid, gid); + error = gfs2_quota_check(dip, inode->i_uid, inode->i_gid); if (error) goto out_quota; @@ -537,8 +527,8 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, if (error) goto out_quota; - init_dinode(dip, gl, inum, mode, uid, gid, generation, dev, symname, size, bhp); - gfs2_quota_change(dip, +1, uid, gid); + init_dinode(dip, ip, symname, bhp); + gfs2_quota_change(dip, +1, inode->i_uid, inode->i_gid); gfs2_trans_end(sdp); out_quota: @@ -657,19 +647,13 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, struct inode *inode = 
NULL; struct gfs2_inode *dip = GFS2_I(dir), *ip; struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 }; + struct gfs2_glock *io_gl; int error; - u64 generation; struct buffer_head *bh = NULL; if (!name->len || name->len > GFS2_FNAMESIZE) return -ENAMETOOLONG; - /* We need a reservation to allocate the new dinode block. The - directory ip temporarily points to the reservation, but this is - being done to get a set of contiguous blocks for the new dinode. - Since this is a create, we don't have a sizehint yet, so it will - have to use the minimum reservation size. */ error = gfs2_rs_alloc(dip); if (error) return error; @@ -688,45 +672,63 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_gunlock; - error = alloc_dinode(dip, &inum.no_addr, &generation); + inode = new_inode(sdp->sd_vfs); + ip = GFS2_I(inode); + error = gfs2_rs_alloc(ip); if (error) - goto fail_gunlock; - inum.no_formal_ino = generation; + goto fail_free_inode; + + set_bit(GIF_INVALID, &ip->i_flags); + inode->i_mode = mode; + inode->i_rdev = dev; + inode->i_size = size; + munge_mode_uid_gid(dip, inode); + ip->i_goal = dip->i_goal; - error = gfs2_glock_nq_num(sdp, inum.no_addr, &gfs2_inode_glops, - LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); + error = alloc_dinode(ip); if (error) - goto fail_gunlock; + goto fail_free_inode; - error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation, dev, symname, size, &bh); + error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); if (error) - goto fail_gunlock2; + goto fail_free_inode; - inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr, - inum.no_formal_ino, 0); - if (IS_ERR(inode)) + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); + if (error) + goto fail_free_inode; + + error = make_dinode(dip, ip, symname, &bh); + if (error) goto fail_gunlock2; - ip = GFS2_I(inode); - error = gfs2_inode_refresh(ip); + error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_iopen_glops, CREATE, &io_gl); if (error) goto fail_gunlock2; - error = gfs2_rs_alloc(ip); + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (error) goto fail_gunlock2; + ip->i_iopen_gh.gh_gl->gl_object = ip; + gfs2_glock_put(io_gl); + gfs2_set_iop(inode); + insert_inode_hash(inode); + + error = gfs2_inode_refresh(ip); + if (error) + goto fail_gunlock3; + error = gfs2_acl_create(dip, inode); if (error) - goto fail_gunlock2; + goto fail_gunlock3; error = gfs2_security_init(dip, ip, name); if (error) - goto fail_gunlock2; + goto fail_gunlock3; error = link_dinode(dip, name, ip); if (error) - goto fail_gunlock2; + goto fail_gunlock3; if (bh) brelse(bh); @@ -739,8 +741,20 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, d_instantiate(dentry, inode); return 0; +fail_gunlock3: + gfs2_glock_dq_uninit(ghs + 1); + if (ip->i_gl) + gfs2_glock_put(ip->i_gl); + goto fail_gunlock; + fail_gunlock2: gfs2_glock_dq_uninit(ghs + 1); +fail_free_inode: + if (ip->i_gl) + gfs2_glock_put(ip->i_gl); + gfs2_rs_delete(ip); + free_inode_nonrcu(inode); + inode = NULL; fail_gunlock: gfs2_glock_dq_uninit(ghs); if (inode && !IS_ERR(inode)) { @@ -748,7 +762,6 @@ fail_gunlock: iput(inode); } fail: - gfs2_rs_delete(dip); if (bh) brelse(bh); return error; -- cgit v1.2.1 From 9dbe9610b9df4efe0946299804ed46bb8f91dec2 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 31 Oct 2012 10:37:10 +0000 Subject: GFS2: Add Orlov allocator Just like ext3, this works 
on the root directory and any directory with the +T flag set. Also, just like ext3, any subdirectory created in one of the just mentioned cases will be allocated to a random resource group (GFS2 equivalent of a block group). If you are creating a set of directories, each of which will contain a job running on a different node, then by setting +T on the parent directory before creating the subdirectories, each will land up in a different resource group, and thus resource group contention between nodes will be kept to a minimum. Signed-off-by: Steven Whitehouse --- fs/gfs2/aops.c | 2 +- fs/gfs2/bmap.c | 2 +- fs/gfs2/file.c | 4 ++-- fs/gfs2/inode.c | 17 +++++++++++------ fs/gfs2/quota.c | 4 ++-- fs/gfs2/rgrp.c | 19 ++++++++++++++++++- fs/gfs2/rgrp.h | 3 ++- fs/gfs2/xattr.c | 2 +- 8 files changed, 38 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 01c4975da4bc..30de4f2a2ea9 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -643,7 +643,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, goto out_unlock; requested = data_blocks + ind_blocks; - error = gfs2_inplace_reserve(ip, requested); + error = gfs2_inplace_reserve(ip, requested, 0); if (error) goto out_qunlock; } diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 1fd3ae237bdd..de70e52caf3a 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1178,7 +1178,7 @@ static int do_grow(struct inode *inode, u64 size) if (error) return error; - error = gfs2_inplace_reserve(ip, 1); + error = gfs2_inplace_reserve(ip, 1, 0); if (error) goto do_grow_qunlock; unstuff = 1; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index e056b4ce4877..dfe2d8cb9b2c 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -432,7 +432,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (ret) goto out_unlock; gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); - ret = gfs2_inplace_reserve(ip, data_blocks + ind_blocks); + ret = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0); if (ret) goto out_quota_unlock; @@ -825,7 +825,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, retry: gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); - error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks); + error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0); if (error) { if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { bytes >>= 1; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 749b05a960ef..ef3ce00bb528 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -385,13 +385,13 @@ static void munge_mode_uid_gid(const struct gfs2_inode *dip, inode->i_gid = current_fsgid(); } -static int alloc_dinode(struct gfs2_inode *ip) +static int alloc_dinode(struct gfs2_inode *ip, u32 flags) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); int error; int dblocks = 1; - error = gfs2_inplace_reserve(ip, RES_DINODE); + error = gfs2_inplace_reserve(ip, RES_DINODE, flags); if (error) goto out; @@ -560,7 +560,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, if (error) goto fail_quota_locks; - error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres); + error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres, 0); if (error) goto fail_quota_locks; @@ -650,6 +650,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, struct gfs2_glock *io_gl; int error; struct buffer_head *bh = NULL; + u32 aflags = 0; if (!name->len || name->len > GFS2_FNAMESIZE) return -ENAMETOOLONG; @@ -685,7 
+686,11 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, munge_mode_uid_gid(dip, inode); ip->i_goal = dip->i_goal; - error = alloc_dinode(ip); + if ((GFS2_I(sdp->sd_root_dir->d_inode) == dip) || + (dip->i_diskflags & GFS2_DIF_TOPDIR)) + aflags |= GFS2_AF_ORLOV; + + error = alloc_dinode(ip, aflags); if (error) goto fail_free_inode; @@ -897,7 +902,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (error) goto out_gunlock; - error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres); + error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres, 0); if (error) goto out_gunlock_q; @@ -1378,7 +1383,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (error) goto out_gunlock; - error = gfs2_inplace_reserve(ndip, sdp->sd_max_dirres); + error = gfs2_inplace_reserve(ndip, sdp->sd_max_dirres, 0); if (error) goto out_gunlock_q; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index c5af8e18f27a..6bbf64f0f5b6 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -816,7 +816,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3; reserved = 1 + (nalloc * (data_blocks + ind_blocks)); - error = gfs2_inplace_reserve(ip, reserved); + error = gfs2_inplace_reserve(ip, reserved, 0); if (error) goto out_alloc; @@ -1605,7 +1605,7 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), &data_blocks, &ind_blocks); blocks = 1 + data_blocks + ind_blocks; - error = gfs2_inplace_reserve(ip, blocks); + error = gfs2_inplace_reserve(ip, blocks, 0); if (error) goto out_i; blocks += gfs2_rg_blocks(ip, blocks); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index bdf3e644baae..99a619788c65 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -1763,6 +1764,15 @@ static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs, return tdiff > (msecs * 1000 * 1000); } +static u32 gfs2_orlov_skip(const struct gfs2_inode *ip) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + u32 skip; + + get_random_bytes(&skip, sizeof(skip)); + return skip % sdp->sd_rgrps; +} + static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin) { struct gfs2_rgrpd *rgd = *pos; @@ -1784,7 +1794,7 @@ static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *b * Returns: errno */ -int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested) +int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 aflags) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *begin = NULL; @@ -1792,6 +1802,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested) int error = 0, rg_locked, flags = 0; u64 last_unlinked = NO_BLOCK; int loops = 0; + u32 skip = 0; if (sdp->sd_args.ar_rgrplvb) flags |= GL_SKIP; @@ -1805,6 +1816,8 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested) } else { rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1); } + if (S_ISDIR(ip->i_inode.i_mode) && (aflags & GFS2_AF_ORLOV)) + skip = gfs2_orlov_skip(ip); if (rs->rs_rbm.rgd == NULL) return -EBADSLT; @@ -1813,6 +1826,8 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested) if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) { rg_locked = 0; + if (skip && skip--) + goto next_rgrp; if (!gfs2_rs_active(rs) && (loops < 2) && gfs2_rgrp_used_recently(rs, 1000) && 
gfs2_rgrp_congested(rs->rs_rbm.rgd, loops)) @@ -1871,6 +1886,8 @@ next_rgrp: /* Find the next rgrp, and continue looking */ if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin)) continue; + if (skip) + continue; /* If we've scanned all the rgrps, but found no free blocks * then this checks for some less likely conditions before diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 24077958dcf6..842185853f6b 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -39,7 +39,8 @@ extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh); extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); -extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested); +#define GFS2_AF_ORLOV 1 +extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 flags); extern void gfs2_inplace_release(struct gfs2_inode *ip); extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index db330e5518cd..76c144b3c9bb 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -734,7 +734,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, if (error) return error; - error = gfs2_inplace_reserve(ip, blks); + error = gfs2_inplace_reserve(ip, blks, 0); if (error) goto out_gunlock_q; -- cgit v1.2.1 From 4c05f9ad4d168098b7ce3ffa7098283f94811ed6 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 2 Nov 2012 11:38:41 +1100 Subject: xfs: invalidate allocbt blocks moved to the free list When we free a block from the alloc btree, we move it to the freelist held in the AGFL and mark it busy in the busy extent tree. This typically happens when we merge btree blocks. Once the transaction is committed and checkpointed, the block can remain on the free list for an indefinite amount of time. Now, this isn't the end of the world at this point - if the free list is shortened, the buffer is invalidated in the transaction that moves it back to free space. If the buffer is allocated as metadata from the free list, then all the modifications get logged, and we have no issues, either. And if it gets allocated as userdata direct from the freelist, it gets invalidated and so will never get written. However, during the time it sits on the free list, pressure on the log can cause the AIL to be pushed and the buffer that covers the block gets pushed for write. IOWs, we end up writing a freed metadata block to disk. Again, this isn't the end of the world because we know from the above we are only writing to free space. The problem, however, is for validation callbacks. If the block was an old btree root block, then the level of the block is going to be higher than the current tree root, and so will fail validation. There may be other inconsistencies in the block as well, and currently we don't care because the block is in free space. Shutting down the filesystem because a freed block doesn't pass write validation, OTOH, is rather unfriendly. So, make sure we always invalidate buffers as they move from the free space trees to the free list so that we guarantee they never get written to disk while on the free list. 
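To summarise the lifecycle described above: (1) the btree block is freed, moved to the AGFL and marked busy in the busy extent tree; (2) while it sits on the free list, AIL pushing can submit its stale contents for write; (3) with this patch, xfs_trans_binval() is called in xfs_allocbt_free_block() as the block moves onto the free list, so the covering buffer can no longer be written back from there.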
Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Phil White Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc_btree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index f1647caace8f..f7876c6d6165 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -121,6 +121,8 @@ xfs_allocbt_free_block( xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); xfs_trans_agbtree_delta(cur->bc_tp, -1); + + xfs_trans_binval(cur->bc_tp, bp); return 0; } -- cgit v1.2.1 From b6aff29f3af7437635ec3d66af9115bb17ba561f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 2 Nov 2012 11:38:42 +1100 Subject: xfs: don't vmap inode cluster buffers during free Inode buffers do not need to be mapped as inodes are read or written directly from/to the pages underlying the buffer. This fixes a regression introduced by commit 611c994 ("xfs: make XBF_MAPPED the default behaviour"). Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 95f7a73b05cb..965598eb308c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1760,7 +1760,8 @@ xfs_ifree_cluster( * to mark all the active inodes on the buffer stale. */ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, - mp->m_bsize * blks_per_cluster, 0); + mp->m_bsize * blks_per_cluster, + XBF_UNMAPPED); if (!bp) return ENOMEM; -- cgit v1.2.1 From 137fff09b7924507871f8e6294dfe57b7a880332 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 2 Nov 2012 14:23:12 +1100 Subject: xfs: fix buffer shutdown reference count mismatch When we shut down the filesystem, we have to unpin and free all the buffers currently active in the CIL. To do this we unpin and remove them in one operation as a result of a failed iclogbuf write. For buffers, we do this removal via a simulated IO completion after marking the buffer stale. At the time we do this, we have two references to the buffer - the active LRU reference and the buf log item. The LRU reference is removed by marking the buffer stale, and the active CIL reference is removed by the xfs_buf_iodone() callback that is run by xfs_buf_do_callbacks() during ioend processing (via the bp->b_iodone callback). However, ioend processing requires one more reference - that of the IO that it is completing. We don't have this reference, so we free the buffer prematurely and use it after it is freed. For buffers marked with XBF_ASYNC, this leads to assert failures in xfs_buf_rele() on debug kernels because the b_hold count is zero. Fix this by making sure we take the necessary IO reference before starting IO completion processing on the stale buffer, and set the XBF_ASYNC flag to ensure that IO completion processing removes all the active references from the buffer to ensure it is fully torn down. 
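In ledger form (restating the description above, not adding new code): the LRU reference is dropped by xfs_buf_stale(); the buf log item reference is dropped by the xfs_buf_iodone() callback run from xfs_buf_do_callbacks(); and the IO reference, the one that was missing, is now taken explicitly with xfs_buf_hold() and later dropped by ioend processing because the buffer is marked XBF_ASYNC.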
Cc: Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_buf_item.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index a8d0ed911196..becf4a97efc6 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -526,7 +526,25 @@ xfs_buf_item_unpin( } xfs_buf_relse(bp); } else if (freed && remove) { + /* + * There are currently two references to the buffer - the active + * LRU reference and the buf log item. What we are about to do + * here - simulate a failed IO completion - requires 3 + * references. + * + * The LRU reference is removed by the xfs_buf_stale() call. The + * buf item reference is removed by the xfs_buf_iodone() + * callback that is run by xfs_buf_do_callbacks() during ioend + * processing (via the bp->b_iodone callback), and then finally + * the ioend processing will drop the IO reference if the buffer + * is marked XBF_ASYNC. + * + * Hence we need to take an additional reference here so that IO + * completion processing doesn't free the buffer prematurely. + */ xfs_buf_lock(bp); + xfs_buf_hold(bp); + bp->b_flags |= XBF_ASYNC; xfs_buf_ioerror(bp, EIO); XFS_BUF_UNDONE(bp); xfs_buf_stale(bp); -- cgit v1.2.1 From 009507b052fa391618eccf9e8c9f484407fd9018 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 2 Nov 2012 11:38:44 +1100 Subject: xfs: fix reading of wrapped log data Commit 4439647 ("xfs: reset buffer pointers before freeing them") in 3.0-rc1 introduced a regression when recovering log buffers that wrapped around the end of log. The second part of the log buffer at the start of the physical log was being read into the header buffer rather than the data buffer, and hence recovery was seeing garbage in the data buffer when it got to the region of the log buffer that was incorrectly read. Cc: # 3.0.x, 3.2.x, 3.4.x 3.6.x Reported-by: Torsten Kaiser Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_log_recover.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 651c98859b04..3e06333d4bd1 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3542,7 +3542,7 @@ xlog_do_recovery_pass( * - order is important. */ error = xlog_bread_offset(log, 0, - bblks - split_bblks, hbp, + bblks - split_bblks, dbp, offset + BBTOB(split_bblks)); if (error) goto bread_err2; -- cgit v1.2.1 From 69a58a43f74eb2cb23d9bce2524dae33c289a40f Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 9 Oct 2012 14:11:45 -0500 Subject: xfs: report projid32bit feature in geometry call When xfs gained the projid32bit feature, it was never added to the FSGEOMETRY ioctl feature flags, so it's not queryable without this patch. 
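As an aside, a minimal user-space sketch of how an application could test for the flag once this patch is applied (this assumes the xfsprogs development headers are installed; the path handling and output are illustrative only):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <xfs/xfs.h>    /* XFS_IOC_FSGEOMETRY, struct xfs_fsop_geom */

int main(int argc, char **argv)
{
	struct xfs_fsop_geom geo;
	int fd = open(argc > 1 ? argv[1] : ".", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, XFS_IOC_FSGEOMETRY, &geo) < 0) {
		perror("XFS_IOC_FSGEOMETRY");
		close(fd);
		return 1;
	}
	printf("projid32bit: %s\n",
	       (geo.flags & XFS_FSOP_GEOM_FLAGS_PROJID32) ? "enabled" : "disabled");
	close(fd);
	return 0;
}

Before this patch the same ioctl simply never reported the bit, even on filesystems with 32-bit project IDs enabled.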
Signed-off-by: Eric Sandeen Reviewed-by: Carlos Maiolino Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_fs.h | 3 ++- fs/xfs/xfs_fsops.c | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index c13fed8c394a..0948c043443b 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -233,7 +233,8 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */ #define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */ #define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */ -#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ +#define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */ +#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ #define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 4beaede43277..7b0a997cf62b 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -97,7 +97,9 @@ xfs_fs_geometry( (xfs_sb_version_haslazysbcount(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) | (xfs_sb_version_hasattr2(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_ATTR2 : 0); + XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) | + (xfs_sb_version_hasprojid32bit(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_PROJID32 : 0); geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? mp->m_sb.sb_logsectsize : BBSIZE; geo->rtsectsize = mp->m_sb.sb_blocksize; -- cgit v1.2.1 From 216b6cbdcbd86b1db0754d58886b466ae31f5a63 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 29 Aug 2012 10:10:10 -0400 Subject: exportfs: add FILEID_INVALID to indicate invalid fid_type This commit adds FILEID_INVALID = 0xff to fid_type to indicate an invalid fid_type. It avoids using the magic number 255. Signed-off-by: Namjae Jeon Signed-off-by: Vivek Trivedi Signed-off-by: J. Bruce Fields --- fs/exportfs/expfs.c | 4 ++-- fs/fhandle.c | 2 +- fs/nfsd/nfsfh.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 29ab099e3e08..f1f1c59c2966 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -322,10 +322,10 @@ static int export_encode_fh(struct inode *inode, struct fid *fid, if (parent && (len < 4)) { *max_len = 4; - return 255; + return FILEID_INVALID; } else if (len < 2) { *max_len = 2; - return 255; + return FILEID_INVALID; } len = 2; diff --git a/fs/fhandle.c b/fs/fhandle.c index f775bfdd6e4a..26f12b95702a 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -52,7 +52,7 @@ static long do_sys_name_to_handle(struct path *path, handle_bytes = handle_dwords * sizeof(u32); handle->handle_bytes = handle_bytes; if ((handle->handle_bytes > f_handle.handle_bytes) || - (retval == 255) || (retval == -ENOSPC)) { /* As per old exportfs_encode_fh documentation * we could return ENOSPC to indicate overflow * But file system returned 255 always. 
So handle diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 032af381b3aa..814afaa4458a 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -572,7 +572,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, if (inode) _fh_update(fhp, exp, dentry); - if (fhp->fh_handle.fh_fileid_type == 255) { + if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) { fh_put(fhp); return nfserr_opnotsupp; } @@ -603,7 +603,7 @@ fh_update(struct svc_fh *fhp) goto out; _fh_update(fhp, fhp->fh_export, dentry); - if (fhp->fh_handle.fh_fileid_type == 255) + if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) return nfserr_opnotsupp; } out: -- cgit v1.2.1 From 01f6c8fd949f3a25a2617e6e1579a5c974b1cabf Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 18 Oct 2012 22:44:21 +0800 Subject: nfsd4: remove unused variable in nfsd4_delegreturn() The variable inode is initialized but never used otherwise, so remove the unused variable. The dpatch engine was used to auto-generate this patch. (https://github.com/weiyj/dpatch) Signed-off-by: Wei Yongjun Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d0237f872cc4..620ff8143751 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3807,12 +3807,10 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfs4_delegation *dp; stateid_t *stateid = &dr->dr_stateid; struct nfs4_stid *s; - struct inode *inode; __be32 status; if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) return status; - inode = cstate->current_fh.fh_dentry->d_inode; nfs4_lock_state(); status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s, cstate->minorversion); -- cgit v1.2.1 From 3c40794b2dd0f355ef4e6bf8d85af5dcd7da7ece Mon Sep 17 00:00:00 2001 From: Yanchuan Nian Date: Wed, 24 Oct 2012 14:44:19 +0800 Subject: nfs: fix wrong object type in lockowner_slab The object type in the lockowner_slab cache is wrong; fix it. Cc: stable@vger.kernel.org Signed-off-by: Yanchuan Nian Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 620ff8143751..fba2996ed511 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2340,7 +2340,7 @@ nfsd4_init_slabs(void) if (openowner_slab == NULL) goto out_nomem; lockowner_slab = kmem_cache_create("nfsd4_lockowners", - sizeof(struct nfs4_openowner), 0, 0, NULL); + sizeof(struct nfs4_lockowner), 0, 0, NULL); if (lockowner_slab == NULL) goto out_nomem; file_slab = kmem_cache_create("nfsd4_files", -- cgit v1.2.1 From ae7095a7c44b4cda963e3d4059788ff60e119684 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 1 Oct 2012 17:50:56 -0400 Subject: nfsd4: helper function for getting mounted_on ino Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4xdr.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index fd548d155088..af65fda7685a 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2014,6 +2014,22 @@ static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err) return 0; } + +static int get_parent_attributes(struct svc_export *exp, struct kstat *stat) +{ + struct path path = exp->ex_path; + int err; + + path_get(&path); + while (follow_up(&path)) { + if (path.dentry != path.mnt->mnt_root) + break; + } + err = vfs_getattr(path.mnt, path.dentry, stat); + path_put(&path); + return err; +} + /* * Note: @fhp can be NULL; in this case, we might have to compose the filehandle * ourselves. @@ -2430,18 +2446,8 @@ out_acl: * and this is the root of a cross-mounted filesystem. */ if (ignore_crossmnt == 0 && - dentry == exp->ex_path.mnt->mnt_root) { - struct path path = exp->ex_path; - path_get(&path); - while (follow_up(&path)) { - if (path.dentry != path.mnt->mnt_root) - break; - } - err = vfs_getattr(path.mnt, path.dentry, &stat); - path_put(&path); - if (err) - goto out_nfserr; - } + dentry == exp->ex_path.mnt->mnt_root) + get_parent_attributes(exp, &stat); WRITE64(stat.ino); } if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { -- cgit v1.2.1 From 7c1f8b65af4bda8eb53cdfe4965cbcfd7fb20c7d Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 1 Nov 2012 16:54:01 -0400 Subject: nfsd4: remove unused init_session return Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index fba2996ed511..bc8507c23525 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -945,7 +945,7 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan) return new; } -static struct nfsd4_session *init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses) +void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses) { int idx; @@ -978,7 +978,6 @@ static struct nfsd4_session *init_session(struct svc_rqst *rqstp, struct nfsd4_s rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa); clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa); } - return new; } /* caller must hold client_lock */ -- cgit v1.2.1 From 7fa10cd12df3ec0873a5db0d8dc8e978423b87dc Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 16 Oct 2012 12:39:33 -0400 Subject: nfsd4: don't BUG in delegation break callback These conditions would indeed indicate bugs in the code, but if we want to hear about them we're likely better off warning and returning than immediately dying while holding file_lock_lock. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index bc8507c23525..db7258c13423 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2554,9 +2554,14 @@ static void nfsd_break_deleg_cb(struct file_lock *fl) struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner; struct nfs4_delegation *dp; - BUG_ON(!fp); - /* We assume break_lease is only called once per lease: */ - BUG_ON(fp->fi_had_conflict); + if (!fp) { + WARN(1, "(%p)->fl_owner NULL\n", fl); + return; + } + if (fp->fi_had_conflict) { + WARN(1, "duplicate break on %p\n", fp); + return; + } /* * We don't want the locks code to timeout the lease for us; * we'll remove it ourself if a delegation isn't returned -- cgit v1.2.1 From fae5096ad217db2e3368e980c1d86223f786856b Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 26 Oct 2012 16:04:08 -0400 Subject: nfsd: assume writeable exportable filesystems have f_sync I don't really see how you could claim to support nfsd and not support fsync somehow. And in practice a quick look through the exportable filesystems suggests the only ones without an ->fsync are read-only (efs, isofs, squashfs) or in-memory (shmem). Also, performing a write and then returning an error if the sync fails (as we would do here in the wgather case) seems unhelpful to clients. Also remove an incorrect comment. Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index c120b48ec305..ed3eb59b607e 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1020,21 +1020,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, inode = dentry->d_inode; exp = fhp->fh_export; - /* - * Request sync writes if - * - the sync export option has been set, or - * - the client requested O_SYNC behavior (NFSv3 feature). - * - The file system doesn't support fsync(). - * When NFSv2 gathered writes have been configured for this volume, - * flushing the data to disk is handled separately below. - */ use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp); - if (!file->f_op->fsync) {/* COMMIT3 cannot work */ - stable = 2; - *stablep = 2; /* FILE_SYNC */ - } - if (!EX_ISSYNC(exp)) stable = 0; if (stable && !use_wgather) { -- cgit v1.2.1 From face15025ffdf664de95e86ae831544154d26c9c Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 26 Oct 2012 16:12:31 -0400 Subject: nfsd: use vfs_fsync_range(), not O_SYNC, for stable writes NFSv4 shares the same struct file across multiple writes. (And we'd like NFSv2 and NFSv3 to do that as well some day.) So setting O_SYNC on the struct file as a way to request a synchronous write doesn't work. Instead, do a vfs_fsync_range() in that case. Reported-by: Peter Staubach Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index ed3eb59b607e..b584205b25b4 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1024,11 +1024,6 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, if (!EX_ISSYNC(exp)) stable = 0; - if (stable && !use_wgather) { - spin_lock(&file->f_lock); - file->f_flags |= O_SYNC; - spin_unlock(&file->f_lock); - } /* Write the data. 
*/ oldfs = get_fs(); set_fs(KERNEL_DS); @@ -1044,8 +1039,12 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, if (inode->i_mode & (S_ISUID | S_ISGID)) kill_suid(dentry); - if (stable && use_wgather) - host_err = wait_for_concurrent_writes(file); + if (stable) { + if (use_wgather) + host_err = wait_for_concurrent_writes(file); + else + host_err = vfs_fsync_range(file, offset, offset+*cnt, 0); + } out_nfserr: dprintk("nfsd: write complete host_err=%d\n", host_err); -- cgit v1.2.1 From acb2887e04c2140c2c63c8bf94e0b446efcc7001 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 27 Mar 2012 14:50:26 -0400 Subject: nfsd4: clean up callback security parsing Move the callback parsing into a separate function. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 118 +++++++++++++++++++++++++++++------------------------- fs/nfsd/state.h | 9 ++++- 2 files changed, 70 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index af65fda7685a..511f980b605c 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -422,6 +422,67 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access DECODE_TAIL; } +static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_cb_sec *cbs) +{ + DECODE_HEAD; + u32 dummy; + char *machine_name; + int i; + int nr_secflavs; + + /* callback_sec_params4 */ + READ_BUF(4); + READ32(nr_secflavs); + for (i = 0; i < nr_secflavs; ++i) { + READ_BUF(4); + READ32(dummy); + switch (dummy) { + case RPC_AUTH_NULL: + /* Nothing to read */ + break; + case RPC_AUTH_UNIX: + READ_BUF(8); + /* stamp */ + READ32(dummy); + + /* machine name */ + READ32(dummy); + READ_BUF(dummy); + SAVEMEM(machine_name, dummy); + + /* uid, gid */ + READ_BUF(8); + READ32(cbs->uid); + READ32(cbs->gid); + + /* more gids */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy * 4); + break; + case RPC_AUTH_GSS: + dprintk("RPC_AUTH_GSS callback secflavor " + "not supported!\n"); + READ_BUF(8); + /* gcbp_service */ + READ32(dummy); + /* gcbp_handle_from_server */ + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + /* gcbp_handle_from_client */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + break; + default: + dprintk("Illegal callback secflavor\n"); + return nfserr_inval; + } + } + DECODE_TAIL; +} + static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts) { DECODE_HEAD; @@ -1237,11 +1298,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, struct nfsd4_create_session *sess) { DECODE_HEAD; - u32 dummy; - char *machine_name; - int i; - int nr_secflavs; READ_BUF(16); COPYMEM(&sess->clientid, 8); @@ -1282,58 +1339,9 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, goto xdr_error; } - READ_BUF(8); + READ_BUF(4); READ32(sess->callback_prog); - - /* callback_sec_params4 */ - READ32(nr_secflavs); - for (i = 0; i < nr_secflavs; ++i) { - READ_BUF(4); - READ32(dummy); - switch (dummy) { - case RPC_AUTH_NULL: - /* Nothing to read */ - break; - case RPC_AUTH_UNIX: - READ_BUF(8); - /* stamp */ - READ32(dummy); - - /* machine name */ - READ32(dummy); - READ_BUF(dummy); - SAVEMEM(machine_name, dummy); - - /* uid, gid */ - READ_BUF(8); - READ32(sess->uid); - READ32(sess->gid); - - /* more gids */ - READ_BUF(4); - READ32(dummy); - READ_BUF(dummy * 4); - break; - case RPC_AUTH_GSS: - dprintk("RPC_AUTH_GSS callback secflavor " - "not supported!\n"); - READ_BUF(8); - /* gcbp_service */ - READ32(dummy); - 
/* gcbp_handle_from_server */ - READ32(dummy); - READ_BUF(dummy); - p += XDR_QUADLEN(dummy); - /* gcbp_handle_from_client */ - READ_BUF(4); - READ32(dummy); - READ_BUF(dummy); - break; - default: - dprintk("Illegal callback secflavor\n"); - return nfserr_inval; - } - } + nfsd4_decode_cb_sec(argp, &sess->cb_sec); DECODE_TAIL; } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index e036894bce57..df33e781f36c 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -150,6 +150,11 @@ struct nfsd4_channel_attrs { u32 rdma_attrs; }; +struct nfsd4_cb_sec { + u32 uid; + u32 gid; +}; + struct nfsd4_create_session { clientid_t clientid; struct nfs4_sessionid sessionid; @@ -158,8 +163,7 @@ struct nfsd4_create_session { struct nfsd4_channel_attrs fore_channel; struct nfsd4_channel_attrs back_channel; u32 callback_prog; - u32 uid; - u32 gid; + struct nfsd4_cb_sec cb_sec; }; struct nfsd4_bind_conn_to_session { @@ -192,6 +196,7 @@ struct nfsd4_session { struct nfs4_sessionid se_sessionid; struct nfsd4_channel_attrs se_fchannel; struct nfsd4_channel_attrs se_bchannel; + struct nfsd4_cb_sec se_cb_sec; struct list_head se_conns; u32 se_cb_prog; u32 se_cb_seq_nr; -- cgit v1.2.1 From c6bb3ca27d78b902baa143b931a8d9ef53298afa Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 1 Nov 2012 16:31:02 -0400 Subject: nfsd4: use callback security parameters in create_session We're currently ignoring the callback security parameters specified in create_session, and just assuming the client wants auth_sys, because that's all the current Linux client happens to care about. But this could cause callbacks to fail for a client that wanted something different. For now, all we're doing is no longer ignoring the uid and gid passed in the auth_sys case. Further patches will add support for auth_null and gss (and possibly use more of the auth_sys information; the spec wants us to use exactly the credential we're passed, though it's hard to imagine why a client would care). Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4callback.c | 49 +++++++++++++++++++++++++++++++++++-------------- fs/nfsd/nfs4state.c | 1 + fs/nfsd/state.h | 1 + 3 files changed, 37 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index bdf29c96e4cd..b32639ee0a42 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -630,6 +630,31 @@ static int max_cb_time(void) return max(nfsd4_lease/10, (time_t)1) * HZ; } +static struct rpc_cred *callback_cred; + +int set_callback_cred(void) +{ + if (callback_cred) + return 0; + callback_cred = rpc_lookup_machine_cred("nfs"); + if (!callback_cred) + return -ENOMEM; + return 0; +} + +struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses) +{ + if (clp->cl_minorversion == 0) { + return get_rpccred(callback_cred); + } else { + struct rpc_auth *auth = client->cl_auth; + struct auth_cred acred = {}; + + acred.uid = ses->se_cb_sec.uid; + acred.gid = ses->se_cb_sec.gid; + return auth->au_ops->lookup_cred(client->cl_auth, &acred, 0); + } +} static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses) { @@ -648,6 +673,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), }; struct rpc_clnt *client; + struct rpc_cred *cred; if (clp->cl_minorversion == 0) { if (!clp->cl_cred.cr_principal && @@ -675,7 +701,13 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c PTR_ERR(client)); return PTR_ERR(client); } + cred = get_backchannel_cred(clp, client, ses); + if (IS_ERR(cred)) { + rpc_shutdown_client(client); + return PTR_ERR(cred); + } clp->cl_cb_client = client; + clp->cl_cb_cred = cred; return 0; } @@ -714,18 +746,6 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = { .rpc_call_done = nfsd4_cb_probe_done, }; -static struct rpc_cred *callback_cred; - -int set_callback_cred(void) -{ - if (callback_cred) - return 0; - callback_cred = rpc_lookup_machine_cred("nfs"); - if (!callback_cred) - return -ENOMEM; - return 0; -} - static struct workqueue_struct *callback_wq; static void run_nfsd4_cb(struct nfsd4_callback *cb) @@ -743,7 +763,6 @@ static void do_probe_callback(struct nfs4_client *clp) cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL]; cb->cb_msg.rpc_argp = NULL; cb->cb_msg.rpc_resp = NULL; - cb->cb_msg.rpc_cred = callback_cred; cb->cb_ops = &nfsd4_cb_probe_ops; @@ -962,6 +981,8 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) if (clp->cl_cb_client) { rpc_shutdown_client(clp->cl_cb_client); clp->cl_cb_client = NULL; + put_rpccred(clp->cl_cb_cred); + clp->cl_cb_cred = NULL; } if (clp->cl_cb_conn.cb_xprt) { svc_xprt_put(clp->cl_cb_conn.cb_xprt); @@ -1010,6 +1031,7 @@ void nfsd4_do_callback_rpc(struct work_struct *w) nfsd4_release_cb(cb); return; } + cb->cb_msg.rpc_cred = clp->cl_cb_cred; rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN, cb->cb_ops, cb); } @@ -1025,7 +1047,6 @@ void nfsd4_cb_recall(struct nfs4_delegation *dp) cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL]; cb->cb_msg.rpc_argp = cb; cb->cb_msg.rpc_resp = cb; - cb->cb_msg.rpc_cred = callback_cred; cb->cb_ops = &nfsd4_cb_recall_ops; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index db7258c13423..dbbbd2fe5236 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -957,6 +957,7 @@ void init_session(struct svc_rqst *rqstp, struct 
nfsd4_session *new, struct nfs4 new->se_cb_seq_nr = 1; new->se_flags = cses->flags; new->se_cb_prog = cses->callback_prog; + new->se_cb_sec = cses->cb_sec; kref_init(&new->se_ref); idx = hash_sessionid(&new->se_sessionid); spin_lock(&client_lock); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index df33e781f36c..bff856c34a32 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -250,6 +250,7 @@ struct nfs4_client { #define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \ 1 << NFSD4_CLIENT_CB_KILL) unsigned long cl_flags; + struct rpc_cred *cl_cb_cred; struct rpc_clnt *cl_cb_client; u32 cl_cb_ident; #define NFSD4_CB_UP 0 -- cgit v1.2.1 From cb73a9f4649bf63c0397e565a15abf8a91ecf56f Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 1 Nov 2012 18:09:48 -0400 Subject: nfsd4: implement backchannel_ctl operation This operation is mandatory for servers to implement. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 6 ++++++ fs/nfsd/nfs4state.c | 14 ++++++++++++++ fs/nfsd/nfs4xdr.c | 13 ++++++++++++- fs/nfsd/state.h | 5 +++++ fs/nfsd/xdr4.h | 2 ++ 5 files changed, 39 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 6c9a4b291dba..f955176f1b6f 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1666,6 +1666,12 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_name = "OP_EXCHANGE_ID", .op_rsize_bop = (nfsd4op_rsize)nfsd4_exchange_id_rsize, }, + [OP_BACKCHANNEL_CTL] = { + .op_func = (nfsd4op_func)nfsd4_backchannel_ctl, + .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING, + .op_name = "OP_BACKCHANNEL_CTL", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, + }, [OP_BIND_CONN_TO_SESSION] = { .op_func = (nfsd4op_func)nfsd4_bind_conn_to_session, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index dbbbd2fe5236..4023e77687ee 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1865,6 +1865,20 @@ static __be32 nfsd4_map_bcts_dir(u32 *dir) return nfserr_inval; } +__be32 nfsd4_backchannel_ctl(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_backchannel_ctl *bc) +{ + struct nfsd4_session *session = cstate->session; + + spin_lock(&client_lock); + session->se_cb_prog = bc->bc_cb_program; + session->se_cb_sec = bc->bc_cb_sec; + spin_unlock(&client_lock); + + nfsd4_probe_callback(session->se_client); + + return nfs_ok; +} + __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_bind_conn_to_session *bcts) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 511f980b605c..d7e7c110246e 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -483,6 +483,17 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_ DECODE_TAIL; } +static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, struct nfsd4_backchannel_ctl *bc) +{ + DECODE_HEAD; + + READ_BUF(4); + READ32(bc->bc_cb_program); + nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec); + + DECODE_TAIL; +} + static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts) { DECODE_HEAD; @@ -1536,7 +1547,7 @@ static nfsd4_dec nfsd41_dec_ops[] = { [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_notsupp, /* new operations for NFSv4.1 */ - [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_backchannel_ctl, [OP_BIND_CONN_TO_SESSION]= 
(nfsd4_dec)nfsd4_decode_bind_conn_to_session, [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id, [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session, diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index bff856c34a32..758bc9c2646b 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -166,6 +166,11 @@ struct nfsd4_create_session { struct nfsd4_cb_sec cb_sec; }; +struct nfsd4_backchannel_ctl { + u32 bc_cb_program; + struct nfsd4_cb_sec bc_cb_sec; +}; + struct nfsd4_bind_conn_to_session { struct nfs4_sessionid sessionid; u32 dir; diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index acd127d4ee82..71c5c47f2750 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -462,6 +462,7 @@ struct nfsd4_op { /* NFSv4.1 */ struct nfsd4_exchange_id exchange_id; + struct nfsd4_backchannel_ctl backchannel_ctl; struct nfsd4_bind_conn_to_session bind_conn_to_session; struct nfsd4_create_session create_session; struct nfsd4_destroy_session destroy_session; @@ -566,6 +567,7 @@ extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, struct nfsd4_sequence *seq); extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_exchange_id *); +extern __be32 nfsd4_backchannel_ctl(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_backchannel_ctl *); extern __be32 nfsd4_bind_conn_to_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_bind_conn_to_session *); extern __be32 nfsd4_create_session(struct svc_rqst *, struct nfsd4_compound_state *, -- cgit v1.2.1 From 57725155dc1b8c78b7a96886d5cdc69dc89e9c54 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 5 Nov 2012 15:10:26 -0500 Subject: nfsd4: common helper to initialize callback work I've found it confusing having the only references to nfsd4_do_callback_rpc() in a different file. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4callback.c | 7 ++++++- fs/nfsd/nfs4state.c | 4 ++-- fs/nfsd/state.h | 2 +- 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index b32639ee0a42..a1aa18db08fb 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1016,7 +1016,7 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) run_nfsd4_cb(cb); } -void nfsd4_do_callback_rpc(struct work_struct *w) +static void nfsd4_do_callback_rpc(struct work_struct *w) { struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, cb_work); struct nfs4_client *clp = cb->cb_clp; @@ -1036,6 +1036,11 @@ void nfsd4_do_callback_rpc(struct work_struct *w) cb->cb_ops, cb); } +void nfsd4_init_callback(struct nfsd4_callback *cb) +{ + INIT_WORK(&cb->cb_work, nfsd4_do_callback_rpc); +} + void nfsd4_cb_recall(struct nfs4_delegation *dp) { struct nfsd4_callback *cb = &dp->dl_recall; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 4023e77687ee..13f3471b02a2 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -340,7 +340,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv fh_copy_shallow(&dp->dl_fh, ¤t_fh->fh_handle); dp->dl_time = 0; atomic_set(&dp->dl_count, 1); - INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc); + nfsd4_init_callback(&dp->dl_recall); return dp; } @@ -1313,7 +1313,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, INIT_LIST_HEAD(&clp->cl_lru); INIT_LIST_HEAD(&clp->cl_callbacks); spin_lock_init(&clp->cl_lock); - INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc); + nfsd4_init_callback(&clp->cl_cb_null); clp->cl_time = get_seconds(); clear_bit(0, &clp->cl_cb_slot_busy); rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 758bc9c2646b..0fd342a2174e 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -470,10 +470,10 @@ extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions); extern void nfs4_free_openowner(struct nfs4_openowner *); extern void nfs4_free_lockowner(struct nfs4_lockowner *); extern int set_callback_cred(void); +extern void nfsd4_init_callback(struct nfsd4_callback *); extern void nfsd4_probe_callback(struct nfs4_client *clp); extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); -extern void nfsd4_do_callback_rpc(struct work_struct *); extern void nfsd4_cb_recall(struct nfs4_delegation *dp); extern int nfsd4_create_callback_queue(void); extern void nfsd4_destroy_callback_queue(void); -- cgit v1.2.1 From 12fc3e92d4b18b4e99af624586e1696479ff36ce Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 5 Nov 2012 16:01:48 -0500 Subject: nfsd4: backchannel should use client-provided security flavor For now this only adds support for AUTH_NULL. (Previously we assumed AUTH_UNIX.) We'll also need AUTH_GSS, which is trickier. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4callback.c | 3 +-- fs/nfsd/nfs4xdr.c | 14 +++++++++++--- fs/nfsd/state.h | 1 + 3 files changed, 13 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index a1aa18db08fb..7bb187ac1492 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -692,7 +692,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c args.bc_xprt = conn->cb_xprt; args.prognumber = clp->cl_cb_session->se_cb_prog; args.protocol = XPRT_TRANSPORT_BC_TCP; - args.authflavor = RPC_AUTH_UNIX; + args.authflavor = ses->se_cb_sec.flavor; } /* Create RPC client */ client = rpc_create(&args); @@ -709,7 +709,6 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c clp->cl_cb_client = client; clp->cl_cb_cred = cred; return 0; - } static void warn_no_callback_path(struct nfs4_client *clp, int reason) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index d7e7c110246e..406d0c4620f6 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -425,7 +425,7 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_cb_sec *cbs) { DECODE_HEAD; - u32 dummy; + u32 dummy, uid, gid; char *machine_name; int i; int nr_secflavs; @@ -433,12 +433,15 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_ /* callback_sec_params4 */ READ_BUF(4); READ32(nr_secflavs); + cbs->flavor = (u32)(-1); for (i = 0; i < nr_secflavs; ++i) { READ_BUF(4); READ32(dummy); switch (dummy) { case RPC_AUTH_NULL: /* Nothing to read */ + if (cbs->flavor == (u32)(-1)) + cbs->flavor = RPC_AUTH_NULL; break; case RPC_AUTH_UNIX: READ_BUF(8); @@ -452,13 +455,18 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_ /* uid, gid */ READ_BUF(8); - READ32(cbs->uid); - READ32(cbs->gid); + READ32(uid); + READ32(gid); /* more gids */ READ_BUF(4); READ32(dummy); READ_BUF(dummy * 4); + if (cbs->flavor == (u32)(-1)) { + cbs->uid = uid; + cbs->gid = gid; + cbs->flavor = RPC_AUTH_UNIX; + } break; case RPC_AUTH_GSS: dprintk("RPC_AUTH_GSS callback secflavor " diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 0fd342a2174e..0498053b8f0e 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -151,6 +151,7 @@ struct nfsd4_channel_attrs { }; struct nfsd4_cb_sec { + u32 flavor; /* (u32)(-1) used to mean "no valid flavor" */ u32 uid; u32 gid; }; -- cgit v1.2.1 From e4bc6522d53b7b8eb02cfac35fd18275fd86269d Mon Sep 17 00:00:00 2001 From: Li Wang Date: Tue, 30 Oct 2012 19:52:40 +0800 Subject: eCryptfs: Avoid unnecessary disk read and data decryption during writing ecryptfs_write_begin grabs a page from the page cache for writing. If the page contains invalid data, or data older than the counterpart on the disk, eCryptfs will read out the corresponding data from the disk into the page, decrypt them, then perform the write. However, if the length of the data to be written is equal to the page size, the whole page of data will be overwritten, in which case it does not matter what the data were before; it is beneficial to perform the write directly rather than bothering to read and decrypt first. With this optimization, according to our test on a machine with an Intel Core 2 Duo processor, an iozone 'write' operation on an existing file with a write size that is a multiple of the page size will enjoy a steady 3x speedup.
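The heart of the change is a single length check in write_begin. Below is a minimal userspace sketch of the same decision, not the eCryptfs code itself; all names are hypothetical stand-ins and PAGE_SIZE plays the role of PAGE_CACHE_SIZE:

#include <string.h>

#define PAGE_SIZE 4096

/* Hypothetical stand-in for the read-from-disk-and-decrypt path. */
static void read_and_decrypt_page(char *page)
{
	memset(page, 0xAA, PAGE_SIZE);
}

/* Only fetch and decrypt the old contents when the write covers less
 * than a whole page; a full-page write overwrites everything, so the
 * stale data are irrelevant and the disk read can be skipped. */
static void write_begin(char *page, size_t len)
{
	if (len < PAGE_SIZE)
		read_and_decrypt_page(page);
}

int main(void)
{
	char page[PAGE_SIZE];

	write_begin(page, PAGE_SIZE);	/* full page: no read, no decrypt */
	write_begin(page, 512);		/* partial write: must read first */
	return 0;
}

The write_end hunk in the diff below completes the picture: it defers SetPageUptodate() until a full page has actually been copied, so a partially copied, never-read page is not treated as valid.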
Signed-off-by: Li Wang Signed-off-by: Yunchuan Wen Signed-off-by: Tyler Hicks --- fs/ecryptfs/mmap.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index bd1d57f98f74..564a1fa34b99 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -338,7 +338,8 @@ static int ecryptfs_write_begin(struct file *file, if (prev_page_end_size >= i_size_read(page->mapping->host)) { zero_user(page, 0, PAGE_CACHE_SIZE); - } else { + SetPageUptodate(page); + } else if (len < PAGE_CACHE_SIZE) { rc = ecryptfs_decrypt_page(page); if (rc) { printk(KERN_ERR "%s: Error decrypting " @@ -348,8 +349,8 @@ static int ecryptfs_write_begin(struct file *file, ClearPageUptodate(page); goto out; } + SetPageUptodate(page); } - SetPageUptodate(page); } } /* If creating a page or more of holes, zero them out via truncate. @@ -499,6 +500,13 @@ static int ecryptfs_write_end(struct file *file, } goto out; } + if (!PageUptodate(page)) { + if (copied < PAGE_CACHE_SIZE) { + rc = 0; + goto out; + } + SetPageUptodate(page); + } /* Fills in zeros if 'to' goes beyond inode size */ rc = fill_zeros_to_end_of_page(page, to); if (rc) { -- cgit v1.2.1 From b72f78cb63fb595af63fc781dced0a6fd354e572 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 8 Nov 2012 10:33:36 -0500 Subject: ext4: fix overhead calculations in ext4_statfs, again "overhead" was a write-only variable in this function after commit 952fc18e; we set it to 0 for minixdf, or to sbi->s_overhead if !minixdf, but never read it again after that. We need to use it, not sbi->s_overhead, when subtracting out overhead for f_blocks, or we get the wrong answer for minixdf. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 80928f716850..1982d3cd9139 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4790,7 +4790,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = EXT4_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; - buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, sbi->s_overhead); + buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead); bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); /* prevent underflow in case that few free space is available */ -- cgit v1.2.1 From 6d138ced751d4e41e02c38ad55d1b3cd2913b150 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 8 Nov 2012 11:11:59 -0500 Subject: ext4: fix awful goto in ext4_mb_new_blocks() I think the whole function could be made prettier, but that goto really took the cake for too-clever-by-half.
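The shape of the fix in the diff below: every failure path discards the partial allocation and falls forward to a single errout label at the end of the function, instead of a backwards goto that lands inside an else-branch. A reduced, self-contained userspace skeleton of that control flow (names hypothetical, ENOSPC hard-coded):

#include <stdio.h>

/* Hypothetical allocator that fails on the first attempt. */
static int try_allocate(int attempt)
{
	return attempt == 0 ? -28 /* ENOSPC */ : 0;
}

static void discard_allocated(void)
{
	puts("discarding partially allocated blocks");
}

/* Failures discard what was allocated and fall forward to one errout
 * label, rather than jumping backwards into an else arm mid-function. */
static int new_blocks(void)
{
	int err;
	int attempt = 0;

repeat:
	err = try_allocate(attempt);
	if (err) {
		discard_allocated();
		goto errout;
	}
	if (++attempt < 1)	/* e.g. allocation came back short: retry */
		goto repeat;

errout:
	if (err)
		puts("allocation failed");
	return err;
}

int main(void)
{
	return new_blocks() ? 1 : 0;
}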
Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 526e55358606..27f421c8043d 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4310,8 +4310,10 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, repeat: /* allocate space in core */ *errp = ext4_mb_regular_allocator(ac); - if (*errp) + if (*errp) { + ext4_discard_allocated_blocks(ac); goto errout; + } /* as we've just preallocated more space than * user requested orinally, we store allocated @@ -4333,10 +4335,10 @@ repeat: ac->ac_b_ex.fe_len = 0; ac->ac_status = AC_STATUS_CONTINUE; goto repeat; - } else if (*errp) - errout: + } else if (*errp) { ext4_discard_allocated_blocks(ac); - else { + goto errout; + } else { block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); ar->len = ac->ac_b_ex.fe_len; } @@ -4347,6 +4349,7 @@ repeat: *errp = -ENOSPC; } +errout: if (*errp) { ac->ac_b_ex.fe_len = 0; ar->len = 0; -- cgit v1.2.1 From 37be2f59d3149b95afaeeeff94edde2c07f165d2 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 8 Nov 2012 11:22:46 -0500 Subject: ext4: remove ext4_handle_release_buffer() ext4_handle_release_buffer() was intended to remove journal write access from a buffer, but it doesn't actually do anything at all other than add a BUFFER_TRACE point, and it's not reliably used even for that. Remove all the associated dead code. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" Reviewed-by: Carlos Maiolino --- fs/ext4/ext4_jbd2.h | 7 ------- fs/ext4/resize.c | 17 +++-------------- fs/ext4/xattr.c | 1 - fs/jbd2/journal.c | 1 - fs/jbd2/transaction.c | 11 ----------- 5 files changed, 3 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 56d258c18303..7177f9b21cb2 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -254,13 +254,6 @@ static inline void ext4_handle_sync(handle_t *handle) handle->h_sync = 1; } -static inline void ext4_handle_release_buffer(handle_t *handle, - struct buffer_head *bh) -{ - if (ext4_handle_valid(handle)) - jbd2_journal_release_buffer(handle, bh); -} - static inline int ext4_handle_is_aborted(handle_t *handle) { if (ext4_handle_valid(handle)) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 47bf06a2765d..d99387b89edd 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -783,7 +783,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, err = ext4_journal_get_write_access(handle, gdb_bh); if (unlikely(err)) - goto exit_sbh; + goto exit_dind; err = ext4_journal_get_write_access(handle, dind); if (unlikely(err)) @@ -792,7 +792,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, /* ext4_reserve_inode_write() gets a reference on the iloc */ err = ext4_reserve_inode_write(handle, inode, &iloc); if (unlikely(err)) - goto exit_dindj; + goto exit_dind; n_group_desc = ext4_kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *), @@ -846,12 +846,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, exit_inode: ext4_kvfree(n_group_desc); - /* ext4_handle_release_buffer(handle, iloc.bh); */ brelse(iloc.bh); -exit_dindj: - /* ext4_handle_release_buffer(handle, dind); */ -exit_sbh: - /* ext4_handle_release_buffer(handle, EXT4_SB(sb)->s_sbh); */ exit_dind: brelse(dind); exit_bh: @@ -969,14 +964,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, } for (i = 0; i < reserved_gdb; i++) { - if ((err = 
ext4_journal_get_write_access(handle, primary[i]))) { - /* - int j; - for (j = 0; j < i; j++) - ext4_handle_release_buffer(handle, primary[j]); - */ + if ((err = ext4_journal_get_write_access(handle, primary[i]))) goto exit_bh; - } } if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 2cdb98d62980..b1adda1b750d 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -794,7 +794,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, int offset = (char *)s->here - bs->bh->b_data; unlock_buffer(bs->bh); - ext4_handle_release_buffer(handle, bs->bh); if (ce) { mb_cache_entry_release(ce); ce = NULL; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 484b8d1c6cb6..dbf41f9452db 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -60,7 +60,6 @@ EXPORT_SYMBOL(jbd2_journal_get_create_access); EXPORT_SYMBOL(jbd2_journal_get_undo_access); EXPORT_SYMBOL(jbd2_journal_set_triggers); EXPORT_SYMBOL(jbd2_journal_dirty_metadata); -EXPORT_SYMBOL(jbd2_journal_release_buffer); EXPORT_SYMBOL(jbd2_journal_forget); #if 0 EXPORT_SYMBOL(journal_sync_buffer); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index a74ba4659549..deffd945c8e2 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1207,17 +1207,6 @@ out: return ret; } -/* - * jbd2_journal_release_buffer: undo a get_write_access without any buffer - * updates, if the update decided in the end that it didn't need access. - * - */ -void -jbd2_journal_release_buffer(handle_t *handle, struct buffer_head *bh) -{ - BUFFER_TRACE(bh, "entry"); -} - /** * void jbd2_journal_forget() - bforget() for potentially-journaled buffers. * @handle: transaction handle -- cgit v1.2.1 From d339450ccad1acb942fc880ca0b44c956e6d2762 Mon Sep 17 00:00:00 2001 From: Zhao Hongjiang Date: Thu, 8 Nov 2012 12:07:33 -0500 Subject: ext4: get rid of redundant code in ext4_fill_super() Signed-off-by: Zhao Hongjiang Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1982d3cd9139..ea21231633eb 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3272,9 +3272,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } sb->s_fs_info = sbi; sbi->s_sb = sb; - sbi->s_mount_opt = 0; - sbi->s_resuid = make_kuid(&init_user_ns, EXT4_DEF_RESUID); - sbi->s_resgid = make_kgid(&init_user_ns, EXT4_DEF_RESGID); sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sb_block = sb_block; if (sb->s_bdev->bd_part) -- cgit v1.2.1 From 7e9620f21d8c9e389fd6845487e07d5df898a2e4 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:12 +1100 Subject: xfs: only update the last_sync_lsn when a transaction completes The log write code stamps each iclog with the current tail LSN in the iclog header so that recovery knows where to find the tail of the log once it has found the head. Normally this is taken from the first item on the AIL - the log item that corresponds to the oldest active item in the log. The problem is that when the AIL is empty, the tail lsn is derived from the l_last_sync_lsn, which is the LSN of the last iclog to be written to the log. In most cases this doesn't happen, because the AIL is rarely empty on an active filesystem. However, when it does, it opens up an interesting case when the transaction being committed to the iclog spans multiple iclogs. That is, the first iclog is stamped with the l_last_sync_lsn, and IO is issued. 
Then the next iclog is set up, the changes copied into the iclog (takes some time), and then the l_last_sync_lsn is stamped into the header and IO is issued. This is still the same transaction, so the tail lsn of both iclogs must be the same for log recovery to find the entire transaction to be able to replay it. The problem arises in that the iclog buffer IO completion updates the l_last_sync_lsn with its own LSN. Therefore, if the first iclog completes its IO before the second iclog is filled and has the tail lsn stamped in it, it will stamp the LSN of the first iclog into its tail lsn field. If the system fails at this point, log recovery will not see a complete transaction, so the transaction will not be replayed. The fix is simple - the l_last_sync_lsn is updated when an iclog buffer IO completes, and this is incorrect. The l_last_sync_lsn should be updated when a transaction is completed by an iclog buffer IO. That is, only iclog buffers that have transaction commit callbacks attached to them should update the l_last_sync_lsn. This means that the last_sync_lsn will only move forward when a commit record is written, not in the middle of a large transaction that is rolling through multiple iclog buffers. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 7f4f9370d0e7..4dad756962d0 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2387,14 +2387,27 @@ xlog_state_do_callback( /* - * update the last_sync_lsn before we drop the + * Completion of an iclog IO does not imply that + * a transaction has completed, as transactions + * can be large enough to span many iclogs. We + * cannot change the tail of the log half way + * through a transaction as this may be the only + * transaction in the log and moving the tail to + * point to the middle of it will prevent + * recovery from finding the start of the + * transaction. Hence we should only update the + * last_sync_lsn if this iclog contains + * transaction completion callbacks on it. + * + * We have to do this before we drop the * icloglock to ensure we are the only one that * can update it. */ ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); - atomic64_set(&log->l_last_sync_lsn, - be64_to_cpu(iclog->ic_header.h_lsn)); + if (iclog->ic_callback) + atomic64_set(&log->l_last_sync_lsn, + be64_to_cpu(iclog->ic_header.h_lsn)); } else ioerrors++; -- cgit v1.2.1 From 408cc4e97a3ccd172d2d676e4b585badf439271b Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Thu, 20 Sep 2012 13:16:45 -0500 Subject: xfs: zero allocation_args on the kernel stack Zero the kernel stack space that makes up the xfs_alloc_arg structures. Signed-off-by: Mark Tinguely Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 1 + fs/xfs/xfs_bmap.c | 3 +++ fs/xfs/xfs_ialloc.c | 1 + 3 files changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 4f33c32affe3..0287f3b1b503 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -1866,6 +1866,7 @@ xfs_alloc_fix_freelist( /* * Initialize the args structure. 
*/ + memset(&targs, 0, sizeof(targs)); targs.tp = tp; targs.mp = mp; targs.agbp = agbp; diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 848ffa77707b..e1545ec2f7d2 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2437,6 +2437,7 @@ xfs_bmap_btalloc( * Normal allocation, done through xfs_alloc_vextent. */ tryagain = isaligned = 0; + memset(&args, 0, sizeof(args)); args.tp = ap->tp; args.mp = mp; args.fsbno = ap->blkno; @@ -3082,6 +3083,7 @@ xfs_bmap_extents_to_btree( * Convert to a btree with two levels, one record in root. */ XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE); + memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = mp; args.firstblock = *firstblock; @@ -3237,6 +3239,7 @@ xfs_bmap_local_to_extents( xfs_buf_t *bp; /* buffer for extent block */ xfs_bmbt_rec_host_t *ep;/* extent record pointer */ + memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = ip->i_mount; args.firstblock = *firstblock; diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 445bf1aef31c..c5c4ef4f2bdb 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -250,6 +250,7 @@ xfs_ialloc_ag_alloc( /* boundary */ struct xfs_perag *pag; + memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = tp->t_mountp; -- cgit v1.2.1 From 326c03555b914ff153ba5b40df87fd6e28e7e367 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 5 Oct 2012 11:06:58 +1000 Subject: xfs: introduce XFS_BMAPI_STACK_SWITCH Certain allocation paths through xfs_bmapi_write() are in situations where we have limited stack available. These are almost always in the buffered IO writeback path when converting delayed allocation extents to real extents. The current stack switch occurs for userdata allocations, which means we also do stack switches for preallocation, direct IO and unwritten extent conversion, even though these call chains have never been implicated in a stack overrun. Hence, let's target just the single stack overrun offender for stack switches. To do that, introduce an XFS_BMAPI_STACK_SWITCH flag that the caller can pass xfs_bmapi_write() to indicate it should switch stacks if it needs to do allocation. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 2 +- fs/xfs/xfs_alloc.h | 1 + fs/xfs/xfs_bmap.c | 4 ++++ fs/xfs/xfs_bmap.h | 5 ++++- fs/xfs/xfs_iomap.c | 4 +++- 5 files changed, 13 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 0287f3b1b503..43f791bcd8b1 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2447,7 +2447,7 @@ xfs_alloc_vextent( { DECLARE_COMPLETION_ONSTACK(done); - if (!args->userdata) + if (!args->stack_switch) return __xfs_alloc_vextent(args); diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 93be4a667ca1..ef7d4885dc2d 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -123,6 +123,7 @@ typedef struct xfs_alloc_arg { struct completion *done; struct work_struct work; int result; + char stack_switch; } xfs_alloc_arg_t; /* diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index e1545ec2f7d2..91259554df8b 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2441,6 +2441,7 @@ xfs_bmap_btalloc( args.tp = ap->tp; args.mp = mp; args.fsbno = ap->blkno; + args.stack_switch = ap->stack_switch; /* Trim the allocation back to the maximum an AG can fit. 
*/ args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp)); @@ -4675,6 +4676,9 @@ xfs_bmapi_allocate( return error; } + if (flags & XFS_BMAPI_STACK_SWITCH) + bma->stack_switch = 1; + error = xfs_bmap_alloc(bma); if (error) return error; diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 803b56d7ce16..b68c598034c1 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -77,6 +77,7 @@ typedef struct xfs_bmap_free * from written to unwritten, otherwise convert from unwritten to written. */ #define XFS_BMAPI_CONVERT 0x040 +#define XFS_BMAPI_STACK_SWITCH 0x080 #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ @@ -85,7 +86,8 @@ typedef struct xfs_bmap_free { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ { XFS_BMAPI_CONTIG, "CONTIG" }, \ - { XFS_BMAPI_CONVERT, "CONVERT" } + { XFS_BMAPI_CONVERT, "CONVERT" }, \ + { XFS_BMAPI_STACK_SWITCH, "STACK_SWITCH" } static inline int xfs_bmapi_aflag(int w) @@ -133,6 +135,7 @@ typedef struct xfs_bmalloca { char userdata;/* set if is user data */ char aeof; /* allocated space at eof */ char conv; /* overwriting unwritten extents */ + char stack_switch; } xfs_bmalloca_t; /* diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 973dff6ad935..7f537663365b 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -584,7 +584,9 @@ xfs_iomap_write_allocate( * pointer that the caller gave to us. */ error = xfs_bmapi_write(tp, ip, map_start_fsb, - count_fsb, 0, &first_block, 1, + count_fsb, + XFS_BMAPI_STACK_SWITCH, + &first_block, 1, imap, &nimaps, &free_list); if (error) goto trans_cancel; -- cgit v1.2.1 From 1f3c785c3adb7d2b109ec7c8f10081d1294b03d3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 5 Oct 2012 11:06:59 +1000 Subject: xfs: move allocation stack switch up to xfs_bmapi_allocate Switching stacks at xfs_alloc_vextent can cause deadlocks when we run out of worker threads on the allocation workqueue. This can occur because xfs_bmap_btalloc can make multiple calls to xfs_alloc_vextent() and even if xfs_alloc_vextent() fails it can return with the AGF locked in the current allocation transaction. If we then need to make another allocation, and all the allocation worker contexts are exhausted because they are blocked waiting for the AGF lock, the holder of the AGF cannot get its xfs_alloc_vextent work completed to release the AGF. Hence allocation effectively deadlocks. To avoid this, move the stack switch one layer up to xfs_bmapi_allocate() so that all of the allocation attempts in a single switched stack transaction occur in a single worker context. This avoids the problem of an allocation being blocked waiting for a worker thread whilst holding the AGF. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 42 +------------------------------------- fs/xfs/xfs_alloc.h | 4 ---- fs/xfs/xfs_bmap.c | 60 ++++++++++++++++++++++++++++++++++++++++++++---------- fs/xfs/xfs_bmap.h | 4 ++++ 4 files changed, 54 insertions(+), 56 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 43f791bcd8b1..335206a9c698 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2208,7 +2208,7 @@ xfs_alloc_read_agf( * group or loop over the allocation groups to find the result. 
*/ int /* error */ -__xfs_alloc_vextent( +xfs_alloc_vextent( xfs_alloc_arg_t *args) /* allocation argument structure */ { xfs_agblock_t agsize; /* allocation group size */ @@ -2418,46 +2418,6 @@ error0: return error; } -static void -xfs_alloc_vextent_worker( - struct work_struct *work) -{ - struct xfs_alloc_arg *args = container_of(work, - struct xfs_alloc_arg, work); - unsigned long pflags; - - /* we are in a transaction context here */ - current_set_flags_nested(&pflags, PF_FSTRANS); - - args->result = __xfs_alloc_vextent(args); - complete(args->done); - - current_restore_flags_nested(&pflags, PF_FSTRANS); -} - -/* - * Data allocation requests often come in with little stack to work on. Push - * them off to a worker thread so there is lots of stack to use. Metadata - * requests, OTOH, are generally from low stack usage paths, so avoid the - * context switch overhead here. - */ -int -xfs_alloc_vextent( - struct xfs_alloc_arg *args) -{ - DECLARE_COMPLETION_ONSTACK(done); - - if (!args->stack_switch) - return __xfs_alloc_vextent(args); - - - args->done = &done; - INIT_WORK_ONSTACK(&args->work, xfs_alloc_vextent_worker); - queue_work(xfs_alloc_wq, &args->work); - wait_for_completion(&done); - return args->result; -} - /* * Free an extent. * Just break up the extent address and hand off to xfs_free_ag_extent diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index ef7d4885dc2d..feacb061bab7 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -120,10 +120,6 @@ typedef struct xfs_alloc_arg { char isfl; /* set if is freelist blocks - !acctg */ char userdata; /* set if this is user data */ xfs_fsblock_t firstblock; /* io first block allocated */ - struct completion *done; - struct work_struct work; - int result; - char stack_switch; } xfs_alloc_arg_t; /* diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 91259554df8b..83d0cf3df930 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2441,7 +2441,6 @@ xfs_bmap_btalloc( args.tp = ap->tp; args.mp = mp; args.fsbno = ap->blkno; - args.stack_switch = ap->stack_switch; /* Trim the allocation back to the maximum an AG can fit. */ args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp)); @@ -4620,12 +4619,11 @@ xfs_bmapi_delay( STATIC int -xfs_bmapi_allocate( - struct xfs_bmalloca *bma, - int flags) +__xfs_bmapi_allocate( + struct xfs_bmalloca *bma) { struct xfs_mount *mp = bma->ip->i_mount; - int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + int whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ? XFS_ATTR_FORK : XFS_DATA_FORK; struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); int tmp_logflags = 0; @@ -4658,25 +4656,25 @@ xfs_bmapi_allocate( * Indicate if this is the first user data in the file, or just any * user data. */ - if (!(flags & XFS_BMAPI_METADATA)) { + if (!(bma->flags & XFS_BMAPI_METADATA)) { bma->userdata = (bma->offset == 0) ? XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; } - bma->minlen = (flags & XFS_BMAPI_CONTIG) ? bma->length : 1; + bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1; /* * Only want to do the alignment at the eof if it is userdata and * allocation length is larger than a stripe unit. 
*/ if (mp->m_dalign && bma->length >= mp->m_dalign && - !(flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { + !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { error = xfs_bmap_isaeof(bma, whichfork); if (error) return error; } - if (flags & XFS_BMAPI_STACK_SWITCH) + if (bma->flags & XFS_BMAPI_STACK_SWITCH) bma->stack_switch = 1; error = xfs_bmap_alloc(bma); @@ -4713,7 +4711,7 @@ xfs_bmapi_allocate( * A wasdelay extent has been initialized, so shouldn't be flagged * as unwritten. */ - if (!bma->wasdel && (flags & XFS_BMAPI_PREALLOC) && + if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) && xfs_sb_version_hasextflgbit(&mp->m_sb)) bma->got.br_state = XFS_EXT_UNWRITTEN; @@ -4741,6 +4739,45 @@ xfs_bmapi_allocate( return 0; } +static void +xfs_bmapi_allocate_worker( + struct work_struct *work) +{ + struct xfs_bmalloca *args = container_of(work, + struct xfs_bmalloca, work); + unsigned long pflags; + + /* we are in a transaction context here */ + current_set_flags_nested(&pflags, PF_FSTRANS); + + args->result = __xfs_bmapi_allocate(args); + complete(args->done); + + current_restore_flags_nested(&pflags, PF_FSTRANS); +} + +/* + * Some allocation requests often come in with little stack to work on. Push + * them off to a worker thread so there is lots of stack to use. Otherwise just + * call directly to avoid the context switch overhead here. + */ +int +xfs_bmapi_allocate( + struct xfs_bmalloca *args) +{ + DECLARE_COMPLETION_ONSTACK(done); + + if (!args->stack_switch) + return __xfs_bmapi_allocate(args); + + + args->done = &done; + INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker); + queue_work(xfs_alloc_wq, &args->work); + wait_for_completion(&done); + return args->result; +} + STATIC int xfs_bmapi_convert_unwritten( struct xfs_bmalloca *bma, @@ -4926,6 +4963,7 @@ xfs_bmapi_write( bma.conv = !!(flags & XFS_BMAPI_CONVERT); bma.wasdel = wasdelay; bma.offset = bno; + bma.flags = flags; /* * There's a 32/64 bit type mismatch between the @@ -4941,7 +4979,7 @@ xfs_bmapi_write( ASSERT(len > 0); ASSERT(bma.length > 0); - error = xfs_bmapi_allocate(&bma, flags); + error = xfs_bmapi_allocate(&bma); if (error) goto error0; if (bma.blkno == NULLFSBLOCK) diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index b68c598034c1..5f469c3516eb 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -136,6 +136,10 @@ typedef struct xfs_bmalloca { char aeof; /* allocated space at eof */ char conv; /* overwriting unwritten extents */ char stack_switch; + int flags; + struct completion *done; + struct work_struct work; + int result; } xfs_bmalloca_t; /* -- cgit v1.2.1 From eaef854335ce09956e930fe4a193327417edc6c9 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 9 Oct 2012 14:50:52 +1100 Subject: xfs: growfs: don't read garbage for new secondary superblocks When updating new secondary superblocks in a growfs operation, the superblock buffer is read from the newly grown region of the underlying device. This is not guaranteed to be zero, so violates the underlying assumption that the unused parts of superblocks are zero filled. Get a new buffer for these secondary superblocks to ensure that the unused regions are zero filled correctly. 
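The rule the patch below implements can be summarized briefly: superblock locations inside the old filesystem size may be read from disk, while locations in the newly grown region must not be, because their contents are unknown; instead a fresh buffer is obtained and zeroed. A small userspace sketch of that decision, with hypothetical helpers standing in for the xfs buffer cache:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SB_SIZE 512

/* Hypothetical stand-in for a buffer-cache read of an on-disk block. */
static char *read_block_from_disk(int agno)
{
	(void)agno;
	return calloc(1, SB_SIZE);
}

/* Blocks inside the old filesystem size may be read; blocks in the
 * newly grown region hold unknown garbage, so get a fresh buffer and
 * zero it instead of reading. */
static char *get_secondary_sb(int agno, int old_ag_count)
{
	char *bp;

	if (agno < old_ag_count)
		return read_block_from_disk(agno);	/* existing region */

	bp = malloc(SB_SIZE);				/* grown region */
	if (bp)
		memset(bp, 0, SB_SIZE);	/* guarantee zeroed unused space */
	return bp;
}

int main(void)
{
	char *sb = get_secondary_sb(5, 4);

	printf("new AG buffer zeroed: %d\n", sb && sb[0] == 0);
	free(sb);
	return 0;
}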
Signed-off-by: Dave Chinner Reviewed-by: Carlos Maiolino Signed-off-by: Ben Myers --- fs/xfs/xfs_fsops.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index c25b094efbf7..4beaede43277 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -399,9 +399,26 @@ xfs_growfs_data_private( /* update secondary superblocks. */ for (agno = 1; agno < nagcount; agno++) { - error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + error = 0; + /* + * new secondary superblocks need to be zeroed, not read from + * disk as the contents of the new area we are growing into is + * completely unknown. + */ + if (agno < oagcount) { + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp); + } else { + bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), + XFS_FSS_TO_BB(mp, 1), 0); + if (bp) + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + else + error = ENOMEM; + } + if (error) { xfs_warn(mp, "error %d reading secondary superblock for ag %d", @@ -423,7 +440,7 @@ xfs_growfs_data_private( break; /* no point in continuing */ } } - return 0; + return error; error0: xfs_trans_cancel(tp, XFS_TRANS_ABORT); -- cgit v1.2.1 From 1e7acbb7bc1ae7c1c62fd1310b3176a820225056 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 25 Oct 2012 17:22:30 +1100 Subject: xfs: silence uninitialised f.file warning. Uninitialised variable build warning introduced by 2903ff0 ("switch simple cases of fget_light to fdget"). gcc is not smart enough to work out that the variable is not used uninitialised, and the commit removed the initialisation at declaration that the old variable had. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 8305f2ac6773..c1df3c623de2 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -70,7 +70,7 @@ xfs_find_handle( int hsize; xfs_handle_t handle; struct inode *inode; - struct fd f; + struct fd f = {0}; struct path path; int error; struct xfs_inode *ip; -- cgit v1.2.1 From ca250b1b3d711936d7dae9e97871f2261347f82d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 2 Nov 2012 11:38:41 +1100 Subject: xfs: invalidate allocbt blocks moved to the free list When we free a block from the alloc btree, we move it to the freelist held in the AGFL and mark it busy in the busy extent tree. This typically happens when we merge btree blocks. Once the transaction is committed and checkpointed, the block can remain on the free list for an indefinite amount of time. Now, this isn't the end of the world at this point - if the free list is shortened, the buffer is invalidated in the transaction that moves it back to free space. If the buffer is allocated as metadata from the free list, then all the modifications get logged, and we have no issues, either. And if it gets allocated as userdata directly from the freelist, it gets invalidated and so will never get written. However, during the time it sits on the free list, pressure on the log can cause the AIL to be pushed and the buffer that covers the block gets pushed for write. IOWs, we end up writing a freed metadata block to disk. Again, this isn't the end of the world because we know from the above we are only writing to free space. 
The problem, however, is for validation callbacks. If the block was an old btree root block, then the level of the block is going to be higher than the current tree root, and so will fail validation. There may be other inconsistencies in the block as well, and currently we don't care because the block is in free space. Shutting down the filesystem because a freed block doesn't pass write validation, OTOH, is rather unfriendly. So, make sure we always invalidate buffers as they move from the free space trees to the free list so that we guarantee they never get written to disk while on the free list. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Phil White Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc_btree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index f1647caace8f..f7876c6d6165 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -121,6 +121,8 @@ xfs_allocbt_free_block( xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); xfs_trans_agbtree_delta(cur->bc_tp, -1); + + xfs_trans_binval(cur->bc_tp, bp); return 0; } -- cgit v1.2.1 From 4b62acfe99e158fb7812982d1cf90a075710a92c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 2 Nov 2012 11:38:42 +1100 Subject: xfs: don't vmap inode cluster buffers during free Inode buffers do not need to be mapped as inodes are read or written directly from/to the pages underlying the buffer. This fixes a regression introduced by commit 611c994 ("xfs: make XBF_MAPPED the default behaviour"). Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 2778258fcfa2..1938b41ee9f5 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1509,7 +1509,8 @@ xfs_ifree_cluster( * to mark all the active inodes on the buffer stale. */ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, - mp->m_bsize * blks_per_cluster, 0); + mp->m_bsize * blks_per_cluster, + XBF_UNMAPPED); if (!bp) return ENOMEM; -- cgit v1.2.1 From 03b1293edad462ad1ad62bcc5160c76758e450d5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 2 Nov 2012 14:23:12 +1100 Subject: xfs: fix buffer shutdown reference count mismatch When we shut down the filesystem, we have to unpin and free all the buffers currently active in the CIL. To do this we unpin and remove them in one operation as a result of a failed iclogbuf write. For buffers, we do this removal via a simulated IO completion after marking the buffer stale. At the time we do this, we have two references to the buffer - the active LRU reference and the buf log item. The LRU reference is removed by marking the buffer stale, and the active CIL reference is removed by the xfs_buf_iodone() callback that is run by xfs_buf_do_callbacks() during ioend processing (via the bp->b_iodone callback). However, ioend processing requires one more reference - that of the IO that it is completing. We don't have this reference, so we free the buffer prematurely and use it after it is freed. For buffers marked with XBF_ASYNC, this leads to assert failures in xfs_buf_rele() on debug kernels because the b_hold count is zero. 
Fix this by making sure we take the necessary IO reference before starting IO completion processing on the stale buffer, and set the XBF_ASYNC flag to ensure that IO completion processing removes all the active references from the buffer to ensure it is fully torn down. Cc: Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_buf_item.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index a8d0ed911196..becf4a97efc6 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -526,7 +526,25 @@ xfs_buf_item_unpin( } xfs_buf_relse(bp); } else if (freed && remove) { + /* + * There are currently two references to the buffer - the active + * LRU reference and the buf log item. What we are about to do + * here - simulate a failed IO completion - requires 3 + * references. + * + * The LRU reference is removed by the xfs_buf_stale() call. The + * buf item reference is removed by the xfs_buf_iodone() + * callback that is run by xfs_buf_do_callbacks() during ioend + * processing (via the bp->b_iodone callback), and then finally + * the ioend processing will drop the IO reference if the buffer + * is marked XBF_ASYNC. + * + * Hence we need to take an additional reference here so that IO + * completion processing doesn't free the buffer prematurely. + */ xfs_buf_lock(bp); + xfs_buf_hold(bp); + bp->b_flags |= XBF_ASYNC; xfs_buf_ioerror(bp, EIO); XFS_BUF_UNDONE(bp); xfs_buf_stale(bp); -- cgit v1.2.1 From 6ce377afd1755eae5c93410ca9a1121dfead7b87 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 2 Nov 2012 11:38:44 +1100 Subject: xfs: fix reading of wrapped log data Commit 4439647 ("xfs: reset buffer pointers before freeing them") in 3.0-rc1 introduced a regression when recovering log buffers that wrapped around the end of log. The second part of the log buffer at the start of the physical log was being read into the header buffer rather than the data buffer, and hence recovery was seeing garbage in the data buffer when it got to the region of the log buffer that was incorrectly read. Cc: # 3.0.x, 3.2.x, 3.4.x 3.6.x Reported-by: Torsten Kaiser Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_log_recover.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 5da3ace352bf..d308749fabf1 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3541,7 +3541,7 @@ xlog_do_recovery_pass( * - order is important. 
*/ error = xlog_bread_offset(log, 0, - bblks - split_bblks, hbp, + bblks - split_bblks, dbp, offset + BBTOB(split_bblks)); if (error) goto bread_err2; -- cgit v1.2.1 From d8ec0c396083ef633a065629df1565246dbb2f33 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 8 Nov 2012 12:19:58 -0500 Subject: ext4: remove unused assignment Signed-off-by: Alan Cox Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 27f421c8043d..442caae80a98 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1373,7 +1373,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block, ex->fe_start += next; while (needed > ex->fe_len && - (buddy = mb_find_buddy(e4b, order, &max))) { + mb_find_buddy(e4b, order, &max)) { if (block + 1 >= max) break; -- cgit v1.2.1 From 79add3a3f795e688e35d5e703d5a8cfa8ef923ac Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Thu, 8 Nov 2012 13:28:29 -0500 Subject: ext4: notify when discard is not supported Notify the user when mounting the file system with the -o discard option but the device does not support discard. Obviously we do not want to fail the mount or disable the option, because the underlying device might change in the future even without a file system remount. Reviewed-by: Carlos Maiolino Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ea21231633eb..6729470ee1a4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4015,6 +4015,14 @@ no_journal: } #endif /* CONFIG_QUOTA */ + if (test_opt(sb, DISCARD)) { + struct request_queue *q = bdev_get_queue(sb->s_bdev); + if (!blk_queue_discard(q)) + ext4_msg(sb, KERN_WARNING, + "mounting with \"discard\" option, but " + "the device does not support discard"); + } + ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, *sbi->s_es->s_mount_opts ? "; " : "", orig_data); -- cgit v1.2.1 From d71c1ae23aa3e7822715c63dc242de6d73002541 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Thu, 8 Nov 2012 14:04:52 -0500 Subject: ext4: warn when discard request fails other than EOPNOTSUPP We should warn the user when a discard request fails. However we need to exclude the -EOPNOTSUPP case since parts of the device might not support it while other parts can. So print the kernel warning when an error other than -EOPNOTSUPP is returned from ext4_issue_discard(). We should also handle error cases in batched discard, again excluding EOPNOTSUPP. 
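The pattern used throughout the patch below is worth calling out: -EOPNOTSUPP is treated as an expected, silent outcome, while any other discard error is reported and then cleared so processing can continue. A self-contained userspace sketch of the same pattern (issue_discard() is a hypothetical stand-in for ext4_issue_discard()):

#include <errno.h>
#include <stdio.h>

/* Hypothetical discard that is unsupported on odd groups, broken on even. */
static int issue_discard(int group)
{
	return group % 2 ? -EOPNOTSUPP : -EIO;
}

/* Warn about real failures; stay silent on -EOPNOTSUPP, since parts of
 * a device may legitimately not support discard. */
static void discard_and_warn(int group)
{
	int err = issue_discard(group);

	if (err && err != -EOPNOTSUPP)
		fprintf(stderr, "discard request in group:%d failed with %d\n",
			group, err);
}

int main(void)
{
	discard_and_warn(0);	/* -EIO: prints a warning */
	discard_and_warn(1);	/* -EOPNOTSUPP: silent */
	return 0;
}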
Reviewed-by: Carlos Maiolino Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 47 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 442caae80a98..1bf6fe785c4f 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2607,9 +2607,17 @@ static void ext4_free_data_callback(struct super_block *sb, mb_debug(1, "gonna free %u blocks in group %u (0x%p):", entry->efd_count, entry->efd_group, entry); - if (test_opt(sb, DISCARD)) - ext4_issue_discard(sb, entry->efd_group, - entry->efd_start_cluster, entry->efd_count); + if (test_opt(sb, DISCARD)) { + err = ext4_issue_discard(sb, entry->efd_group, + entry->efd_start_cluster, + entry->efd_count); + if (err && err != -EOPNOTSUPP) + ext4_msg(sb, KERN_WARNING, "discard request in" + " group:%d block:%d count:%d failed" + " with %d", entry->efd_group, + entry->efd_start_cluster, + entry->efd_count, err); + } err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); /* we expect to find existing buddy because it's pinned */ @@ -4659,8 +4667,16 @@ do_more: * with group lock held. generate_buddy look at * them with group lock_held */ - if (test_opt(sb, DISCARD)) - ext4_issue_discard(sb, block_group, bit, count); + if (test_opt(sb, DISCARD)) { + err = ext4_issue_discard(sb, block_group, bit, count); + if (err && err != -EOPNOTSUPP) + ext4_msg(sb, KERN_WARNING, "discard request in" + " group:%d block:%d count:%lu failed" + " with %d", block_group, bit, count, + err); + } + + ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); mb_free_blocks(inode, &e4b, bit, count_clusters); @@ -4854,10 +4870,11 @@ error_return: * one will allocate those blocks, mark it as used in buddy bitmap. This must * be called with under the group lock. 
*/ -static void ext4_trim_extent(struct super_block *sb, int start, int count, +static int ext4_trim_extent(struct super_block *sb, int start, int count, ext4_group_t group, struct ext4_buddy *e4b) { struct ext4_free_extent ex; + int ret = 0; trace_ext4_trim_extent(sb, group, start, count); @@ -4873,9 +4890,10 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count, */ mb_mark_used(e4b, &ex); ext4_unlock_group(sb, group); - ext4_issue_discard(sb, group, start, count); + ret = ext4_issue_discard(sb, group, start, count); ext4_lock_group(sb, group); mb_free_blocks(NULL, e4b, start, ex.fe_len); + return ret; } /** @@ -4904,7 +4922,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, void *bitmap; ext4_grpblk_t next, count = 0, free_count = 0; struct ext4_buddy e4b; - int ret; + int ret = 0; trace_ext4_trim_all_free(sb, group, start, max); @@ -4931,8 +4949,11 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, next = mb_find_next_bit(bitmap, max + 1, start); if ((next - start) >= minblocks) { - ext4_trim_extent(sb, start, - next - start, group, &e4b); + ret = ext4_trim_extent(sb, start, + next - start, group, &e4b); + if (ret && ret != -EOPNOTSUPP) + break; + ret = 0; count += next - start; } free_count += next - start; @@ -4953,8 +4974,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, break; } - if (!ret) + if (!ret) { + ret = count; EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); + } out: ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); @@ -4962,7 +4985,7 @@ out: ext4_debug("trimmed %d blocks in the group %d\n", count, group); - return count; + return ret; } /** -- cgit v1.2.1 From b5645534ce84c21695c2f82d4d4f67cf2a67229a Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Thu, 8 Nov 2012 14:33:43 -0500 Subject: ext4: print 'flags' in ext4_ext_handle_uninitialized_extents In trace_ext4_ext_handle_uninitialized_extents we don't care about the value of map->m_flags because it is probably 0; we prefer to print the value of flags, because it tells us how this extent will be handled in this function. Reviewed-by: Lukas Czerner Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7011ac967208..59e6e12e0029 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3663,8 +3663,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, flags, allocated); ext4_ext_show_leaf(inode, path); - trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated, - newblock); + trace_ext4_ext_handle_uninitialized_extents(inode, map, flags, + allocated, newblock); /* get_block() before submit the IO, split the extent */ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { -- cgit v1.2.1 From 19b303d8b5a0e8150a4697c01ca03e75a0a17469 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Thu, 8 Nov 2012 14:34:04 -0500 Subject: ext4: print map->m_flags in trace_ext4_ext/ind_map_blocks_exit When we use trace_ext4_ext/ind_map_blocks_exit, print the value of map->m_flags so that we can understand the extent's current status.
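The mechanical change in both tracepoint patches is the hook signature: instead of passing individual fields (m_lblk, m_pblk, m_len), the whole map is passed so the hook can also print m_flags. A simplified userspace sketch (struct and names hypothetical, not the ext4 types):

#include <stdio.h>

/* Simplified stand-in for struct ext4_map_blocks. */
struct map_blocks {
	unsigned int lblk, pblk, len, flags;
};

/* Passing the whole map lets the trace hook print m_flags as well,
 * which the old field-by-field signature could not expose. */
static void trace_map_blocks_exit(const struct map_blocks *map, int ret)
{
	printf("lblk %u pblk %u len %u m_flags 0x%x ret %d\n",
	       map->lblk, map->pblk, map->len, map->flags, ret);
}

int main(void)
{
	struct map_blocks map = {
		.lblk = 0, .pblk = 100, .len = 8, .flags = 0x20,
	};

	trace_map_blocks_exit(&map, 8);
	return 0;
}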
Reviewed-by: Lukas Czerner Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 3 +-- fs/ext4/indirect.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 59e6e12e0029..7a64c193b2af 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4284,8 +4284,7 @@ out2: kfree(path); } - trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, - newblock, map->m_len, err ? err : allocated); + trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated); return err ? err : allocated; } diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 792e388e7b44..292337f27c9c 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -755,8 +755,7 @@ cleanup: partial--; } out: - trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, - map->m_pblk, map->m_len, err); + trace_ext4_ind_map_blocks_exit(inode, map, err); return err; } -- cgit v1.2.1 From 37794732467dd998a34bfce19738ad3ef1f37507 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Thu, 8 Nov 2012 14:47:52 -0500 Subject: ext4: fix missing call to trace_ext4_ext_map_blocks_exit When ext4_ext_handle_uninitialized_extents(), we will directly return from ext4_ext_map_blocks(). The trace point of trace_ext4_ext_map_blocks_exit isn't called, and the user doesn't see any result. This patch tries to fix this problem. Meanwhile in ext4_ext_handle_uninitialized_extents it returns errors or the number of allocated blocks. So 'ret' variable can be removed due to previously modifications. Signed-off-by: Zheng Liu --- fs/ext4/extents.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7a64c193b2af..dce97de6a409 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3911,7 +3911,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_extent newex, *ex, *ex2; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_fsblk_t newblock = 0; - int free_on_err = 0, err = 0, depth, ret; + int free_on_err = 0, err = 0, depth; unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; struct ext4_allocation_request ar; @@ -4007,10 +4007,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ee_len, ee_start); goto out; } - ret = ext4_ext_handle_uninitialized_extents( + allocated = ext4_ext_handle_uninitialized_extents( handle, inode, map, path, flags, allocated, newblock); - return ret; + goto out3; } } @@ -4284,6 +4284,7 @@ out2: kfree(path); } +out3: trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated); return err ? 
err : allocated; -- cgit v1.2.1 From 8d8c1825709020c73b5e66f96c114f6a1f6461e7 Mon Sep 17 00:00:00 2001 From: Anatol Pomozov Date: Thu, 8 Nov 2012 14:53:35 -0500 Subject: ext4: use 'inode' variable that is already dereferenced Tested: xfs tests Reviewed-by: Zheng Liu Signed-off-by: Anatol Pomozov Signed-off-by: "Theodore Ts'o" --- fs/ext4/page-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 68e896e12a67..0fd16e653ebd 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -111,7 +111,7 @@ static int ext4_end_io(ext4_io_end_t *io) inode_dio_done(inode); /* Wake up anyone waiting on unwritten extent conversion */ if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) - wake_up_all(ext4_ioend_wq(io->inode)); + wake_up_all(ext4_ioend_wq(inode)); return ret; } -- cgit v1.2.1 From 8b0f165f790c897fa744e7fed6f0bfeb6eb6f494 Mon Sep 17 00:00:00 2001 From: Anatol Pomozov Date: Thu, 8 Nov 2012 15:07:16 -0500 Subject: ext4: remove code duplication in ext4_get_block_write_nolock() 729f52c6be51013 introduced function ext4_get_block_write_nolock() that is very similar to _ext4_get_block(). Eliminate code duplication by passing different flags to _ext4_get_block() Tested: xfs tests Reviewed-by: Zheng Liu Signed-off-by: Anatol Pomozov Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 63 ++++++++++++++++++++++----------------------------------- 1 file changed, 24 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b3c243b9afa5..f84bfd6d1867 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -683,7 +683,7 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; - if (flags && !handle) { + if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) { /* Direct IO write... 
*/ if (map.m_len > DIO_MAX_BLOCKS) map.m_len = DIO_MAX_BLOCKS; @@ -880,6 +880,8 @@ static int do_journal_get_write_access(handle_t *handle, static int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); +static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -2850,29 +2852,12 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock, } static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int flags) + struct buffer_head *bh_result, int create) { - handle_t *handle = ext4_journal_current_handle(); - struct ext4_map_blocks map; - int ret = 0; - - ext4_debug("ext4_get_block_write_nolock: inode %lu, flag %d\n", - inode->i_ino, flags); - - flags = EXT4_GET_BLOCKS_NO_LOCK; - - map.m_lblk = iblock; - map.m_len = bh_result->b_size >> inode->i_blkbits; - - ret = ext4_map_blocks(handle, inode, &map, flags); - if (ret > 0) { - map_bh(bh_result, inode->i_sb, map.m_pblk); - bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) | - map.m_flags; - bh_result->b_size = inode->i_sb->s_blocksize * map.m_len; - ret = 0; - } - return ret; + ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n", + inode->i_ino, create); + return _ext4_get_block(inode, iblock, bh_result, + EXT4_GET_BLOCKS_NO_LOCK); } static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, @@ -3003,6 +2988,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, loff_t final_size = offset + count; if (rw == WRITE && final_size <= inode->i_size) { int overwrite = 0; + get_block_t *get_block_func = NULL; + int dio_flags = 0; BUG_ON(iocb->private == NULL); @@ -3056,22 +3043,20 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, ext4_inode_aio_set(inode, io_end); } - if (overwrite) - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, - ext4_get_block_write_nolock, - ext4_end_io_dio, - NULL, - 0); - else - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, - ext4_get_block_write, - ext4_end_io_dio, - NULL, - DIO_LOCKING); + if (overwrite) { + get_block_func = ext4_get_block_write_nolock; + } else { + get_block_func = ext4_get_block_write; + dio_flags = DIO_LOCKING; + } + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + get_block_func, + ext4_end_io_dio, + NULL, + dio_flags); + if (iocb->private) ext4_inode_aio_set(inode, NULL); /* -- cgit v1.2.1 From 24ec19b0ae83a385ad9c55520716da671274b96c Mon Sep 17 00:00:00 2001 From: Eugene Shatokhin Date: Thu, 8 Nov 2012 15:11:11 -0500 Subject: ext4: fix memory leak in ext4_xattr_set_acl()'s error path In ext4_xattr_set_acl(), if ext4_journal_start() returns an error, posix_acl_release() will not be called for 'acl' which may result in a memory leak. This patch fixes that. 
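Reduced to the reference-counting pattern involved, the fix has this shape (a sketch, not the full function; posix_acl_from_xattr() arguments and the ENOSPC retry loop are abbreviated):

        acl = posix_acl_from_xattr(value, size);  /* takes a reference */
        /* ... validate the ACL ... */
        handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
        if (IS_ERR(handle)) {
                error = PTR_ERR(handle);
                goto release_and_out;  /* was: return PTR_ERR(handle), the leak */
        }
        error = ext4_set_acl(handle, inode, type, acl);
        ext4_journal_stop(handle);
        /* ... retry on ENOSPC elided ... */
release_and_out:
        posix_acl_release(acl);        /* reference dropped on every exit path */
        return error;
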
Reviewed-by: Lukas Czerner Signed-off-by: Eugene Shatokhin Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/acl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index d3c5b88fd89f..e6e0d988439b 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -423,8 +423,10 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, retry: handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto release_and_out; + } error = ext4_set_acl(handle, inode, type, acl); ext4_journal_stop(handle); if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) -- cgit v1.2.1 From 07aa2ea13814ea60d12f7330b6d5ccfdb0c3ba4d Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Thu, 8 Nov 2012 15:16:54 -0500 Subject: ext4: fix error handling in ext4_fill_super() There are some places in ext4_fill_super() where we would not return proper error code if something fails. The confusion is caused probably due to the fact that we have two "kind-of" return variables 'ret'and 'err'. 'ret' is used to return error code from ext4_fill_super() where err is used to store return values from other functions within ext4_fill_super(). However some places were missing the obligatory 'ret = err'. We could put the assignment where it is missing, but we can have better "future proof" solution. Or we could convert the code to use just one, but it would require more rewrites. This commit fixes the problem by returning value from 'err' variable if it is set and 'ret' otherwise in error handling branch of the ext4_fill_super(). The reasoning is that 'ret' value is often set to default "-EINVAL" or explicit value, where 'err' is used to store return value from other functions and should be otherwise zero. 
https://bugzilla.kernel.org/show_bug.cgi?id=48431 Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6729470ee1a4..18e89fafebd1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3256,7 +3256,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) unsigned int i; int needs_recovery, has_huge_files, has_bigalloc; __u64 blocks_count; - int err; + int err = 0; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; ext4_group_t first_not_zeroed; @@ -3282,6 +3282,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) for (cp = sb->s_id; (cp = strchr(cp, '/'));) *cp = '!'; + /* -EINVAL is default */ ret = -EINVAL; blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); if (!blocksize) { @@ -3659,7 +3660,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) " too large to mount safely on this system"); if (sizeof(sector_t) < 8) ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); - ret = err; goto failed_mount; } @@ -3767,7 +3767,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (err) { ext4_msg(sb, KERN_ERR, "insufficient memory"); - ret = err; goto failed_mount3; } @@ -3894,8 +3893,8 @@ no_journal: if (es->s_overhead_clusters) sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); else { - ret = ext4_calculate_overhead(sb); - if (ret) + err = ext4_calculate_overhead(sb); + if (err) goto failed_mount_wq; } @@ -3907,6 +3906,7 @@ no_journal: alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); if (!EXT4_SB(sb)->dio_unwritten_wq) { printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); + ret = -ENOMEM; goto failed_mount_wq; } @@ -4009,8 +4009,8 @@ no_journal: /* Enable quota usage during mount. */ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && !(sb->s_flags & MS_RDONLY)) { - ret = ext4_enable_quotas(sb); - if (ret) + err = ext4_enable_quotas(sb); + if (err) goto failed_mount7; } #endif /* CONFIG_QUOTA */ @@ -4089,7 +4089,7 @@ out_fail: kfree(sbi); out_free_orig: kfree(orig_data); - return ret; + return err ? err : ret; } /* -- cgit v1.2.1 From c0677e6d0f9d991adff972b8d06cb83de1f8ee8e Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Thu, 8 Nov 2012 15:18:54 -0500 Subject: ext4: add data structures for the extent status tree This patch adds two structures that supports extent status tree, extent_status and ext4_es_tree. Currently extent_status is used to track a delay extent for an inode, which record the start block and the length of the delay extent. ext4_es_tree is used to store all extent_status for an inode in memory. 
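The operations on the tree arrive in a later patch in this series; with the fields defined here, per-inode setup is deliberately trivial, roughly (sketch consistent with that follow-up patch):

        void ext4_es_init_tree(struct ext4_es_tree *tree)
        {
                tree->root = RB_ROOT;   /* empty rbtree of extent_status nodes */
                tree->cache_es = NULL;  /* nothing recently accessed yet */
        }

with i_es_lock initialized alongside it (rwlock_init()) when the in-memory inode is set up, since every operation on the tree takes that lock.
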
Signed-off-by: Yongqiang Yang Signed-off-by: Allison Henderson Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 6 ++++++ fs/ext4/extents_status.h | 25 +++++++++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 fs/ext4/extents_status.h (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3c20de1d59d0..bcc634b26d46 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -811,6 +811,8 @@ struct ext4_ext_cache { __u32 ec_len; /* must be 32bit to return holes */ }; +#include "extents_status.h" + /* * fourth extended file system inode data in memory */ @@ -888,6 +890,10 @@ struct ext4_inode_info { struct list_head i_prealloc_list; spinlock_t i_prealloc_lock; + /* extents status tree */ + struct ext4_es_tree i_es_tree; + rwlock_t i_es_lock; + /* ialloc */ ext4_group_t i_last_alloc_group; diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h new file mode 100644 index 000000000000..8be2ab9c9425 --- /dev/null +++ b/fs/ext4/extents_status.h @@ -0,0 +1,25 @@ +/* + * fs/ext4/extents_status.h + * + * Written by Yongqiang Yang + * Modified by + * Allison Henderson + * Zheng Liu + * + */ + +#ifndef _EXT4_EXTENTS_STATUS_H +#define _EXT4_EXTENTS_STATUS_H + +struct extent_status { + struct rb_node rb_node; + ext4_lblk_t start; /* first block extent covers */ + ext4_lblk_t len; /* length of extent in block */ +}; + +struct ext4_es_tree { + struct rb_root root; + struct extent_status *cache_es; /* recently accessed extent */ +}; + +#endif /* _EXT4_EXTENTS_STATUS_H */ -- cgit v1.2.1 From 27b52867925e3aaed090063c1c58a7537e6373f3 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 6 Nov 2012 09:50:38 -0500 Subject: xfs: add EOFBLOCKS inode tagging/untagging Add the XFS_ICI_EOFBLOCKS_TAG inode tag to identify inodes with speculatively preallocated blocks beyond EOF. An inode is tagged when speculative preallocation occurs and untagged either via truncate down or when post-EOF blocks are freed via release or reclaim. The tag management is intentionally not aggressive to prefer simplicity over the complexity of handling all the corner cases under which post-EOF blocks could be freed (i.e., forward truncation, fallocate, write error conditions, etc.). This means that a tagged inode may or may not have post-EOF blocks after a period of time. The tag is eventually cleared when the inode is released or reclaimed. 
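Because the tag is mirrored into the per-AG radix tree, a later scan can visit only allocation groups that actually contain tagged inodes; in sketch form (the real walk is added later in this series):

        xfs_agnumber_t  ag = 0;
        struct xfs_perag *pag;

        while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_EOFBLOCKS_TAG))) {
                ag = pag->pag_agno + 1;
                /* walk only the tagged inodes inside this AG */
                xfs_perag_put(pag);
        }

Untagged AGs are never touched, which keeps the scan cheap on filesystems where speculative preallocation is rare.
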
Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_ag.h | 1 + fs/xfs/xfs_icache.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_icache.h | 3 +++ fs/xfs/xfs_iomap.c | 8 +++++++ fs/xfs/xfs_iops.c | 4 ++++ fs/xfs/xfs_trace.h | 5 +++++ fs/xfs/xfs_vnodeops.c | 2 ++ 7 files changed, 85 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 44d65c1533c0..22bd4db011c8 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -233,6 +233,7 @@ typedef struct xfs_perag { #define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup in xfs_inode_ag_iterator */ #define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ +#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */ #define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) #define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 9c8703b5cd72..f9afc5ff0482 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1128,3 +1128,65 @@ xfs_reclaim_inodes_count( return reclaimable; } +void +xfs_inode_set_eofblocks_tag( + xfs_inode_t *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + int tagged; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + spin_lock(&pag->pag_ici_lock); + trace_xfs_inode_set_eofblocks_tag(ip); + + tagged = radix_tree_tagged(&pag->pag_ici_root, + XFS_ICI_EOFBLOCKS_TAG); + radix_tree_tag_set(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + if (!tagged) { + /* propagate the eofblocks tag up into the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_set(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + + trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } + + spin_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); +} + +void +xfs_inode_clear_eofblocks_tag( + xfs_inode_t *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + spin_lock(&pag->pag_ici_lock); + trace_xfs_inode_clear_eofblocks_tag(ip); + + radix_tree_tag_clear(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) { + /* clear the eofblocks tag from the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_clear(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } + + spin_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); +} + diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 222e22f16b4a..db3613075dc6 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -35,6 +35,9 @@ void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); +void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); +void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); + int xfs_sync_inode_grab(struct xfs_inode *ip); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index a066cf1766ab..add06b4e9a63 100644 --- a/fs/xfs/xfs_iomap.c +++ 
b/fs/xfs/xfs_iomap.c @@ -41,6 +41,7 @@ #include "xfs_utils.h" #include "xfs_iomap.h" #include "xfs_trace.h" +#include "xfs_icache.h" #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ @@ -450,6 +451,13 @@ retry: if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) return xfs_alert_fsblock_zero(ip, &imap[0]); + /* + * Tag the inode as speculatively preallocated so we can reclaim this + * space on demand, if necessary. + */ + if (prealloc) + xfs_inode_set_eofblocks_tag(ip); + *ret_imap = imap[0]; return 0; } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 4e00cf091d2c..81f5c4953287 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -38,6 +38,7 @@ #include "xfs_vnodeops.h" #include "xfs_inode_item.h" #include "xfs_trace.h" +#include "xfs_icache.h" #include #include @@ -854,6 +855,9 @@ xfs_setattr_size( * and do not wait the usual (long) time for writeout. */ xfs_iflags_set(ip, XFS_ITRUNCATED); + + /* A truncate down always removes post-EOF blocks. */ + xfs_inode_clear_eofblocks_tag(ip); } if (mask & ATTR_CTIME) { diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 7d36ccf57f93..6f46e034b766 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -130,6 +130,8 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag); DEFINE_PERAG_REF_EVENT(xfs_perag_put); DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); +DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks); +DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks); TRACE_EVENT(xfs_attr_list_node_descend, TP_PROTO(struct xfs_attr_list_context *ctx, @@ -585,6 +587,9 @@ DEFINE_INODE_EVENT(xfs_update_time); DEFINE_INODE_EVENT(xfs_dquot_dqalloc); DEFINE_INODE_EVENT(xfs_dquot_dqdetach); +DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag); +DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag); + DECLARE_EVENT_CLASS(xfs_iref_class, TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), TP_ARGS(ip, caller_ip), diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 2ee1f49da0aa..e6e1d11dfdf2 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -238,6 +238,8 @@ xfs_free_eofblocks( } else { error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (!error) + xfs_inode_clear_eofblocks_tag(ip); } xfs_iunlock(ip, XFS_ILOCK_EXCL); -- cgit v1.2.1 From a454f7428ffa03c8e1321124d9074101b7290be6 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 6 Nov 2012 09:50:39 -0500 Subject: xfs: support a tag-based inode_ag_iterator Genericize xfs_inode_ag_walk() to support an optional radix tree tag and args argument for the execute function. Create a new wrapper called xfs_inode_ag_iterator_tag() that performs a tag based walk of perag's and inodes. 
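A hedged sketch of the resulting calling convention (the callback name and body are illustrative, not from the patch; the signatures match the diff below):

        STATIC int
        xfs_example_execute(
                struct xfs_inode        *ip,
                struct xfs_perag        *pag,
                int                     flags,
                void                    *args)
        {
                /* per-inode work; returning EAGAIN records a skipped inode */
                return 0;
        }

        error = xfs_inode_ag_iterator_tag(mp, xfs_example_execute, flags,
                                          NULL, XFS_ICI_EOFBLOCKS_TAG);

Internally a tag of -1 preserves the old untagged gang lookup, which is what the unchanged xfs_inode_ag_iterator() passes.
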
Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_icache.c | 56 ++++++++++++++++++++++++++++++++++++++++++------ fs/xfs/xfs_icache.h | 9 ++++++-- fs/xfs/xfs_qm_syscalls.c | 5 +++-- 3 files changed, 59 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index f9afc5ff0482..2a96dc48ebe6 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -516,8 +516,11 @@ xfs_inode_ag_walk( struct xfs_mount *mp, struct xfs_perag *pag, int (*execute)(struct xfs_inode *ip, - struct xfs_perag *pag, int flags), - int flags) + struct xfs_perag *pag, int flags, + void *args), + int flags, + void *args, + int tag) { uint32_t first_index; int last_error = 0; @@ -536,9 +539,17 @@ restart: int i; rcu_read_lock(); - nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, + + if (tag == -1) + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void **)batch, first_index, XFS_LOOKUP_BATCH); + else + nr_found = radix_tree_gang_lookup_tag( + &pag->pag_ici_root, + (void **) batch, first_index, + XFS_LOOKUP_BATCH, tag); + if (!nr_found) { rcu_read_unlock(); break; @@ -579,7 +590,7 @@ restart: for (i = 0; i < nr_found; i++) { if (!batch[i]) continue; - error = execute(batch[i], pag, flags); + error = execute(batch[i], pag, flags, args); IRELE(batch[i]); if (error == EAGAIN) { skipped++; @@ -608,8 +619,10 @@ int xfs_inode_ag_iterator( struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, - struct xfs_perag *pag, int flags), - int flags) + struct xfs_perag *pag, int flags, + void *args), + int flags, + void *args) { struct xfs_perag *pag; int error = 0; @@ -619,7 +632,36 @@ xfs_inode_ag_iterator( ag = 0; while ((pag = xfs_perag_get(mp, ag))) { ag = pag->pag_agno + 1; - error = xfs_inode_ag_walk(mp, pag, execute, flags); + error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1); + xfs_perag_put(pag); + if (error) { + last_error = error; + if (error == EFSCORRUPTED) + break; + } + } + return XFS_ERROR(last_error); +} + +int +xfs_inode_ag_iterator_tag( + struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, + struct xfs_perag *pag, int flags, + void *args), + int flags, + void *args, + int tag) +{ + struct xfs_perag *pag; + int error = 0; + int last_error = 0; + xfs_agnumber_t ag; + + ag = 0; + while ((pag = xfs_perag_get_tag(mp, ag, tag))) { + ag = pag->pag_agno + 1; + error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag); xfs_perag_put(pag); if (error) { last_error = error; diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index db3613075dc6..54c113478dfc 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -40,7 +40,12 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); int xfs_sync_inode_grab(struct xfs_inode *ip); int xfs_inode_ag_iterator(struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), - int flags); + int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, + int flags, void *args), + int flags, void *args); +int xfs_inode_ag_iterator_tag(struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, + int flags, void *args), + int flags, void *args, int tag); #endif diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 7a9071f8855f..5f53e75409b8 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -846,7 +846,8 @@ STATIC int xfs_dqrele_inode( struct xfs_inode *ip, struct xfs_perag *pag, - int flags) + int flags, + void *args) { /* skip 
quota inodes */ if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || @@ -882,5 +883,5 @@ xfs_qm_dqrele_all_inodes( uint flags) { ASSERT(mp->m_quotainfo); - xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags); + xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL); } -- cgit v1.2.1 From 72b53efa4a6125a4c334871c58268c430605819a Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 6 Nov 2012 09:50:40 -0500 Subject: xfs: create helper to check whether to free eofblocks on inode This check is used in multiple places to determine whether we should check for (and potentially free) post EOF blocks on an inode. Add a helper to consolidate the check. Note that when we remove an inode from the cache (xfs_inactive()), we are required to trim post-EOF blocks even if the inode is marked preallocated or append-only to maintain correct space accounting. The 'force' parameter to xfs_can_free_eofblocks() specifies whether we should ignore the prealloc/append-only status of the inode. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_inode.c | 37 +++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_inode.h | 1 + fs/xfs/xfs_vnodeops.c | 19 +++++++------------ 3 files changed, 45 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 965598eb308c..7449cb943efd 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3912,3 +3912,40 @@ xfs_iext_irec_update_extoffs( ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; } } + +/* + * Test whether it is appropriate to check an inode for and free post EOF + * blocks. The 'force' parameter determines whether we should also consider + * regular files that are marked preallocated or append-only. + */ +bool +xfs_can_free_eofblocks(struct xfs_inode *ip, bool force) +{ + /* prealloc/delalloc exists only on regular files */ + if (!S_ISREG(ip->i_d.di_mode)) + return false; + + /* + * Zero sized files with no cached pages and delalloc blocks will not + * have speculative prealloc/delalloc blocks to remove. + */ + if (VFS_I(ip)->i_size == 0 && + VN_CACHED(VFS_I(ip)) == 0 && + ip->i_delayed_blks == 0) + return false; + + /* If we haven't read in the extent list, then don't do it now. */ + if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) + return false; + + /* + * Do not free real preallocated or append-only files unless the file + * has delalloc blocks and we are forced to remove them. 
+ */ + if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) + if (!force || ip->i_delayed_blks == 0) + return false; + + return true; +} + diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 1fc2065e010b..21b4de3df716 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -585,6 +585,7 @@ void xfs_iext_irec_compact(xfs_ifork_t *); void xfs_iext_irec_compact_pages(xfs_ifork_t *); void xfs_iext_irec_compact_full(xfs_ifork_t *); void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int); +bool xfs_can_free_eofblocks(struct xfs_inode *, bool); #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index e6e1d11dfdf2..c4c153900205 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -436,11 +436,7 @@ xfs_release( if (ip->i_d.di_nlink == 0) return 0; - if ((S_ISREG(ip->i_d.di_mode) && - (VFS_I(ip)->i_size > 0 || - (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && - (ip->i_df.if_flags & XFS_IFEXTENTS)) && - (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { + if (xfs_can_free_eofblocks(ip, false)) { /* * If we can't get the iolock just skip truncating the blocks @@ -516,13 +512,12 @@ xfs_inactive( goto out; if (ip->i_d.di_nlink != 0) { - if ((S_ISREG(ip->i_d.di_mode) && - (VFS_I(ip)->i_size > 0 || - (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && - (ip->i_df.if_flags & XFS_IFEXTENTS) && - (!(ip->i_d.di_flags & - (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || - ip->i_delayed_blks != 0))) { + /* + * force is true because we are evicting an inode from the + * cache. Post-eof blocks must be freed, lest we end up with + * broken free space accounting. + */ + if (xfs_can_free_eofblocks(ip, true)) { error = xfs_free_eofblocks(mp, ip, false); if (error) return VN_INACTIVE_CACHE; -- cgit v1.2.1 From 40165e27617e2a98bf8588001d2f2872fae2fee2 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 6 Nov 2012 09:50:41 -0500 Subject: xfs: make xfs_free_eofblocks() non-static, return EAGAIN on trylock failure Turn xfs_free_eofblocks() into a non-static function, return EAGAIN to indicate trylock failure and make sure this error is not propagated in xfs_release(). Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_vnodeops.c | 6 +++--- fs/xfs/xfs_vnodeops.h | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index c4c153900205..c2ddd7a43942 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -151,7 +151,7 @@ xfs_readlink( * when the link count isn't zero and by xfs_dm_punch_hole() when * punching a hole to EOF. 
*/ -STATIC int +int xfs_free_eofblocks( xfs_mount_t *mp, xfs_inode_t *ip, @@ -200,7 +200,7 @@ xfs_free_eofblocks( if (need_iolock) { if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { xfs_trans_cancel(tp, 0); - return 0; + return EAGAIN; } } @@ -463,7 +463,7 @@ xfs_release( return 0; error = xfs_free_eofblocks(mp, ip, true); - if (error) + if (error && error != EAGAIN) return error; /* delalloc blocks after truncation means it really is dirty */ diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 447e146b2ba6..52fafc416a0c 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -57,5 +57,6 @@ int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first, int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last); int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); +int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool); #endif /* _XFS_VNODEOPS_H */ -- cgit v1.2.1 From 41176a68e3f710630feace536d0277a092e206b5 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 6 Nov 2012 09:50:42 -0500 Subject: xfs: create function to scan and clear EOFBLOCKS inodes xfs_inodes_free_eofblocks() implements scanning functionality for EOFBLOCKS inodes. It uses the AG iterator to walk the tagged inodes and free post-EOF blocks via the xfs_inode_free_eofblocks() execute function. The scan can be invoked in best-effort mode or wait (force) mode. A best-effort scan (default) handles all inodes that do not have a dirty cache and we successfully acquire the io lock via trylock. In wait mode, we continue to cycle through an AG until all inodes are handled. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_icache.c | 43 +++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_icache.h | 1 + fs/xfs/xfs_trace.h | 1 + 3 files changed, 45 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 2a96dc48ebe6..d115cb44b103 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1170,6 +1170,49 @@ xfs_reclaim_inodes_count( return reclaimable; } +STATIC int +xfs_inode_free_eofblocks( + struct xfs_inode *ip, + struct xfs_perag *pag, + int flags, + void *args) +{ + int ret; + + if (!xfs_can_free_eofblocks(ip, false)) { + /* inode could be preallocated or append-only */ + trace_xfs_inode_free_eofblocks_invalid(ip); + xfs_inode_clear_eofblocks_tag(ip); + return 0; + } + + /* + * If the mapping is dirty the operation can block and wait for some + * time. Unless we are waiting, skip it. 
+ */ + if (!(flags & SYNC_WAIT) && + mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) + return 0; + + ret = xfs_free_eofblocks(ip->i_mount, ip, true); + + /* don't revisit the inode if we're not waiting */ + if (ret == EAGAIN && !(flags & SYNC_WAIT)) + ret = 0; + + return ret; +} + +int +xfs_icache_free_eofblocks( + struct xfs_mount *mp, + int flags) +{ + ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); + return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags, + NULL, XFS_ICI_EOFBLOCKS_TAG); +} + void xfs_inode_set_eofblocks_tag( xfs_inode_t *ip) diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 54c113478dfc..cb6b8d0eee61 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -37,6 +37,7 @@ void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); +int xfs_icache_free_eofblocks(struct xfs_mount *, int); int xfs_sync_inode_grab(struct xfs_inode *ip); int xfs_inode_ag_iterator(struct xfs_mount *mp, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 6f46e034b766..cb5234632072 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -589,6 +589,7 @@ DEFINE_INODE_EVENT(xfs_dquot_dqdetach); DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag); DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag); +DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); DECLARE_EVENT_CLASS(xfs_iref_class, TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), -- cgit v1.2.1 From 8ca149de80478441352a8622ea15fae7de703ced Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 7 Nov 2012 12:21:12 -0500 Subject: xfs: add XFS_IOC_FREE_EOFBLOCKS ioctl The XFS_IOC_FREE_EOFBLOCKS ioctl allows users to invoke an EOFBLOCKS scan. The xfs_eofblocks structure is defined to support the command parameters (scan mode). Signed-off-by: Brian Foster Reviewed-by: Mark Tinguely Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_fs.h | 17 +++++++++++++++++ fs/xfs/xfs_icache.c | 10 +++++++--- fs/xfs/xfs_icache.h | 2 +- fs/xfs/xfs_ioctl.c | 20 ++++++++++++++++++++ 4 files changed, 45 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 0948c043443b..0cfa30813b16 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -339,6 +339,22 @@ typedef struct xfs_error_injection { } xfs_error_injection_t; +/* + * Speculative preallocation trimming. + */ +#define XFS_EOFBLOCKS_VERSION 1 +struct xfs_eofblocks { + __u32 eof_version; + __u32 eof_flags; + __u64 pad[15]; +}; + +/* eof_flags values */ +#define XFS_EOF_FLAGS_SYNC (1 << 0) /* sync/wait mode scan */ +#define XFS_EOF_FLAGS_VALID \ + (XFS_EOF_FLAGS_SYNC) + + /* * The user-level Handle Request interface structure. 
*/ @@ -457,6 +473,7 @@ typedef struct xfs_handle { /* XFS_IOC_GETBIOSIZE ---- deprecated 47 */ #define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap) #define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64) +#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_eofblocks) /* * ioctl commands that replace IRIX syssgi()'s diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index d115cb44b103..fbb74c715266 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1206,11 +1206,15 @@ xfs_inode_free_eofblocks( int xfs_icache_free_eofblocks( struct xfs_mount *mp, - int flags) + struct xfs_eofblocks *eofb) { - ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); + int flags = SYNC_TRYLOCK; + + if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC)) + flags = SYNC_WAIT; + return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags, - NULL, XFS_ICI_EOFBLOCKS_TAG); + eofb, XFS_ICI_EOFBLOCKS_TAG); } void diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index cb6b8d0eee61..4934a77024cf 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -37,7 +37,7 @@ void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); -int xfs_icache_free_eofblocks(struct xfs_mount *, int); +int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); int xfs_sync_inode_grab(struct xfs_inode *ip); int xfs_inode_ag_iterator(struct xfs_mount *mp, diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index c1df3c623de2..5b20ab0b4f9d 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -42,6 +42,7 @@ #include "xfs_inode_item.h" #include "xfs_export.h" #include "xfs_trace.h" +#include "xfs_icache.h" #include #include @@ -1602,6 +1603,25 @@ xfs_file_ioctl( error = xfs_errortag_clearall(mp, 1); return -error; + case XFS_IOC_FREE_EOFBLOCKS: { + struct xfs_eofblocks eofb; + + if (copy_from_user(&eofb, arg, sizeof(eofb))) + return -XFS_ERROR(EFAULT); + + if (eofb.eof_version != XFS_EOFBLOCKS_VERSION) + return -XFS_ERROR(EINVAL); + + if (eofb.eof_flags & ~XFS_EOF_FLAGS_VALID) + return -XFS_ERROR(EINVAL); + + if (memchr_inv(eofb.pad, 0, sizeof(eofb.pad))) + return -XFS_ERROR(EINVAL); + + error = xfs_icache_free_eofblocks(mp, &eofb); + return -error; + } + default: return -ENOTTY; } -- cgit v1.2.1 From 3e3f9f5863548e870edfcc72e7617ac8ddcad44a Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 7 Nov 2012 12:21:13 -0500 Subject: xfs: add inode id filtering to eofblocks scan Support inode ID filtering in the eofblocks scan. The caller must set the associated XFS_EOF_FLAGS_*ID bit and ID field. 
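Invoking the filtered scan from userspace then looks roughly like this (a sketch; the header providing these definitions is assumed to be the XFS uapi header installed on the build host, and error handling is elided):

        #include <sys/ioctl.h>
        #include <xfs/xfs.h>    /* assumed location of the XFS ioctl ABI */

        struct xfs_eofblocks eofb = {
                .eof_version = XFS_EOFBLOCKS_VERSION,
                .eof_flags   = XFS_EOF_FLAGS_SYNC | XFS_EOF_FLAGS_UID,
                .eof_uid     = 1000,
                /* remaining fields stay zero: the kernel rejects nonzero padding */
        };

        ioctl(fd, XFS_IOC_FREE_EOFBLOCKS, &eofb);  /* fd: any fd on the fs */
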
Signed-off-by: Brian Foster Reviewed-by: Mark Tinguely Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_fs.h | 14 ++++++++++++-- fs/xfs/xfs_icache.c | 22 ++++++++++++++++++++++ fs/xfs/xfs_ioctl.c | 3 ++- 3 files changed, 36 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 0cfa30813b16..a19f9b205c15 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -346,13 +346,23 @@ typedef struct xfs_error_injection { struct xfs_eofblocks { __u32 eof_version; __u32 eof_flags; - __u64 pad[15]; + uid_t eof_uid; + gid_t eof_gid; + prid_t eof_prid; + __u32 pad32; + __u64 pad64[13]; }; /* eof_flags values */ #define XFS_EOF_FLAGS_SYNC (1 << 0) /* sync/wait mode scan */ +#define XFS_EOF_FLAGS_UID (1 << 1) /* filter by uid */ +#define XFS_EOF_FLAGS_GID (1 << 2) /* filter by gid */ +#define XFS_EOF_FLAGS_PRID (1 << 3) /* filter by project id */ #define XFS_EOF_FLAGS_VALID \ - (XFS_EOF_FLAGS_SYNC) + (XFS_EOF_FLAGS_SYNC | \ + XFS_EOF_FLAGS_UID | \ + XFS_EOF_FLAGS_GID | \ + XFS_EOF_FLAGS_PRID) /* diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index fbb74c715266..b239da91c43b 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1170,6 +1170,21 @@ xfs_reclaim_inodes_count( return reclaimable; } +STATIC int +xfs_inode_match_id( + struct xfs_inode *ip, + struct xfs_eofblocks *eofb) +{ + if (eofb->eof_flags & XFS_EOF_FLAGS_UID) + return ip->i_d.di_uid == eofb->eof_uid; + else if (eofb->eof_flags & XFS_EOF_FLAGS_GID) + return ip->i_d.di_gid == eofb->eof_gid; + else if (eofb->eof_flags & XFS_EOF_FLAGS_PRID) + return xfs_get_projid(ip) == eofb->eof_prid; + + return 0; +} + STATIC int xfs_inode_free_eofblocks( struct xfs_inode *ip, @@ -1178,6 +1193,7 @@ xfs_inode_free_eofblocks( void *args) { int ret; + struct xfs_eofblocks *eofb = args; if (!xfs_can_free_eofblocks(ip, false)) { /* inode could be preallocated or append-only */ @@ -1194,6 +1210,12 @@ xfs_inode_free_eofblocks( mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) return 0; + if (eofb && + (eofb->eof_flags & (XFS_EOF_FLAGS_UID|XFS_EOF_FLAGS_GID| + XFS_EOF_FLAGS_PRID)) && + !xfs_inode_match_id(ip, eofb)) + return 0; + ret = xfs_free_eofblocks(ip->i_mount, ip, true); /* don't revisit the inode if we're not waiting */ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 5b20ab0b4f9d..c1c3ef88a260 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1615,7 +1615,8 @@ xfs_file_ioctl( if (eofb.eof_flags & ~XFS_EOF_FLAGS_VALID) return -XFS_ERROR(EINVAL); - if (memchr_inv(eofb.pad, 0, sizeof(eofb.pad))) + if (memchr_inv(&eofb.pad32, 0, sizeof(eofb.pad32)) || + memchr_inv(eofb.pad64, 0, sizeof(eofb.pad64))) return -XFS_ERROR(EINVAL); error = xfs_icache_free_eofblocks(mp, &eofb); -- cgit v1.2.1 From 1b5560488d1ab7c932f6f99385b41116838c3486 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 6 Nov 2012 09:50:45 -0500 Subject: xfs: support multiple inode id filtering in eofblocks scan Enhance the eofblocks scan code to filter based on multiply specified inode id values. When multiple inode id values are specified, only inodes that match all id values are selected. 
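Continuing the userspace sketch above, the id filters now compose as a conjunction:

        /* select only inodes owned by uid 1000 AND project id 42 */
        eofb.eof_flags = XFS_EOF_FLAGS_UID | XFS_EOF_FLAGS_PRID;
        eofb.eof_uid   = 1000;
        eofb.eof_prid  = 42;

Previously only the first set id bit was honoured; with the reworked xfs_inode_match_id() an inode that fails any one of the requested id checks is skipped.
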
Signed-off-by: Brian Foster Reviewed-by: Mark Tinguely Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_icache.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index b239da91c43b..32908909815e 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1175,14 +1175,19 @@ xfs_inode_match_id( struct xfs_inode *ip, struct xfs_eofblocks *eofb) { - if (eofb->eof_flags & XFS_EOF_FLAGS_UID) - return ip->i_d.di_uid == eofb->eof_uid; - else if (eofb->eof_flags & XFS_EOF_FLAGS_GID) - return ip->i_d.di_gid == eofb->eof_gid; - else if (eofb->eof_flags & XFS_EOF_FLAGS_PRID) - return xfs_get_projid(ip) == eofb->eof_prid; + if (eofb->eof_flags & XFS_EOF_FLAGS_UID && + ip->i_d.di_uid != eofb->eof_uid) + return 0; - return 0; + if (eofb->eof_flags & XFS_EOF_FLAGS_GID && + ip->i_d.di_gid != eofb->eof_gid) + return 0; + + if (eofb->eof_flags & XFS_EOF_FLAGS_PRID && + xfs_get_projid(ip) != eofb->eof_prid) + return 0; + + return 1; } STATIC int @@ -1210,10 +1215,7 @@ xfs_inode_free_eofblocks( mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) return 0; - if (eofb && - (eofb->eof_flags & (XFS_EOF_FLAGS_UID|XFS_EOF_FLAGS_GID| - XFS_EOF_FLAGS_PRID)) && - !xfs_inode_match_id(ip, eofb)) + if (eofb && !xfs_inode_match_id(ip, eofb)) return 0; ret = xfs_free_eofblocks(ip->i_mount, ip, true); -- cgit v1.2.1 From 00ca79a04bef1a1b30ef8afd992d905b6d986caf Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 7 Nov 2012 12:21:14 -0500 Subject: xfs: add minimum file size filtering to eofblocks scan Support minimum file size filtering in the eofblocks scan. The caller must set the XFS_EOF_FLAGS_MINFILESIZE flags bit and minimum file size value in bytes. 
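In the same userspace sketch, the size filter stacks on top of any id filters:

        /* additionally skip files smaller than 16 MiB */
        eofb.eof_flags |= XFS_EOF_FLAGS_MINFILESIZE;
        eofb.eof_min_file_size = 16ULL * 1024 * 1024;  /* bytes */

Inodes whose XFS_ISIZE() falls below the threshold are left alone even if they carry the EOFBLOCKS tag.
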
Signed-off-by: Brian Foster Reviewed-by: Mark Tinguely Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_fs.h | 7 +++++-- fs/xfs/xfs_icache.c | 11 +++++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index a19f9b205c15..6dda3f949b04 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -350,7 +350,8 @@ struct xfs_eofblocks { gid_t eof_gid; prid_t eof_prid; __u32 pad32; - __u64 pad64[13]; + __u64 eof_min_file_size; + __u64 pad64[12]; }; /* eof_flags values */ @@ -358,11 +359,13 @@ struct xfs_eofblocks { #define XFS_EOF_FLAGS_UID (1 << 1) /* filter by uid */ #define XFS_EOF_FLAGS_GID (1 << 2) /* filter by gid */ #define XFS_EOF_FLAGS_PRID (1 << 3) /* filter by project id */ +#define XFS_EOF_FLAGS_MINFILESIZE (1 << 4) /* filter by min file size */ #define XFS_EOF_FLAGS_VALID \ (XFS_EOF_FLAGS_SYNC | \ XFS_EOF_FLAGS_UID | \ XFS_EOF_FLAGS_GID | \ - XFS_EOF_FLAGS_PRID) + XFS_EOF_FLAGS_PRID | \ + XFS_EOF_FLAGS_MINFILESIZE) /* diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 32908909815e..906e6dcd2c55 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1215,8 +1215,15 @@ xfs_inode_free_eofblocks( mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) return 0; - if (eofb && !xfs_inode_match_id(ip, eofb)) - return 0; + if (eofb) { + if (!xfs_inode_match_id(ip, eofb)) + return 0; + + /* skip the inode if the file size is too small */ + if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && + XFS_ISIZE(ip) < eofb->eof_min_file_size) + return 0; + } ret = xfs_free_eofblocks(ip->i_mount, ip, true); -- cgit v1.2.1 From 579b62faa5fb16ffeeb88cda5e2c4e95730881af Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 6 Nov 2012 09:50:47 -0500 Subject: xfs: add background scanning to clear eofblocks inodes Create a new mount workqueue and delayed_work to enable background scanning and freeing of eofblocks inodes. The scanner kicks in once speculative preallocation occurs and stops requeueing itself when no eofblocks inodes exist. The scan interval is based on the new 'speculative_prealloc_lifetime' tunable (default to 5m). The background scanner performs unfiltered, best effort scans (which skips inodes under lock contention or with a dirty cache mapping). Signed-off-by: Brian Foster Reviewed-by: Mark Tinguely Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_globals.c | 4 +++- fs/xfs/xfs_icache.c | 29 +++++++++++++++++++++++++++++ fs/xfs/xfs_icache.h | 1 + fs/xfs/xfs_linux.h | 1 + fs/xfs/xfs_mount.c | 2 ++ fs/xfs/xfs_mount.h | 3 +++ fs/xfs/xfs_super.c | 9 +++++++++ fs/xfs/xfs_sysctl.c | 9 +++++++++ fs/xfs/xfs_sysctl.h | 1 + 9 files changed, 58 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 76e81cff70b9..5399ef222dd7 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -21,7 +21,8 @@ /* * Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n, * other XFS code uses these values. Times are measured in centisecs (i.e. - * 100ths of a second). + * 100ths of a second) with the exception of eofb_timer, which is measured in + * seconds. 
*/ xfs_param_t xfs_params = { /* MIN DFLT MAX */ @@ -40,4 +41,5 @@ xfs_param_t xfs_params = { .rotorstep = { 1, 1, 255 }, .inherit_nodfrg = { 0, 1, 1 }, .fstrm_timer = { 1, 30*100, 3600*100}, + .eofb_timer = { 1, 300, 3600*24}, }; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 906e6dcd2c55..96e344e3e927 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -615,6 +615,32 @@ restart: return last_error; } +/* + * Background scanning to trim post-EOF preallocated space. This is queued + * based on the 'background_prealloc_discard_period' tunable (5m by default). + */ +STATIC void +xfs_queue_eofblocks( + struct xfs_mount *mp) +{ + rcu_read_lock(); + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG)) + queue_delayed_work(mp->m_eofblocks_workqueue, + &mp->m_eofblocks_work, + msecs_to_jiffies(xfs_eofb_secs * 1000)); + rcu_read_unlock(); +} + +void +xfs_eofblocks_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_eofblocks_work); + xfs_icache_free_eofblocks(mp, NULL); + xfs_queue_eofblocks(mp); +} + int xfs_inode_ag_iterator( struct xfs_mount *mp, @@ -1273,6 +1299,9 @@ xfs_inode_set_eofblocks_tag( XFS_ICI_EOFBLOCKS_TAG); spin_unlock(&ip->i_mount->m_perag_lock); + /* kick off background trimming */ + xfs_queue_eofblocks(ip->i_mount); + trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno, -1, _RET_IP_); } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 4934a77024cf..e0f138c70a2f 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -38,6 +38,7 @@ void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); +void xfs_eofblocks_worker(struct work_struct *); int xfs_sync_inode_grab(struct xfs_inode *ip); int xfs_inode_ag_iterator(struct xfs_mount *mp, diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 828662f70d64..0a134ca5211c 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -118,6 +118,7 @@ #define xfs_rotorstep xfs_params.rotorstep.val #define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val #define xfs_fstrm_centisecs xfs_params.fstrm_timer.val +#define xfs_eofb_secs xfs_params.eofb_timer.val #define current_cpu() (raw_smp_processor_id()) #define current_pid() (current->pid) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 6f1c997704cd..41ae7e1590f5 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1428,6 +1428,8 @@ xfs_unmountfs( __uint64_t resblks; int error; + cancel_delayed_work_sync(&mp->m_eofblocks_work); + xfs_qm_unmount_quotas(mp); xfs_rtunmount_inodes(mp); IRELE(mp->m_rootip); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index a631ca3b9065..dc306a09f56f 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -196,6 +196,8 @@ typedef struct xfs_mount { #endif struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ struct delayed_work m_reclaim_work; /* background inode reclaim */ + struct delayed_work m_eofblocks_work; /* background eof blocks + trimming */ __int64_t m_update_flags; /* sb flags we need to update on the next remount,rw */ struct shrinker m_inode_shrink; /* inode reclaim shrinker */ @@ -207,6 +209,7 @@ typedef struct xfs_mount { struct workqueue_struct *m_cil_workqueue; struct workqueue_struct *m_reclaim_workqueue; struct workqueue_struct *m_log_workqueue; + struct workqueue_struct 
*m_eofblocks_workqueue; } xfs_mount_t; /* diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 3d9ea947e9f8..ab8839b26272 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -874,8 +874,15 @@ xfs_init_mount_workqueues( if (!mp->m_log_workqueue) goto out_destroy_reclaim; + mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s", + WQ_NON_REENTRANT, 0, mp->m_fsname); + if (!mp->m_eofblocks_workqueue) + goto out_destroy_log; + return 0; +out_destroy_log: + destroy_workqueue(mp->m_log_workqueue); out_destroy_reclaim: destroy_workqueue(mp->m_reclaim_workqueue); out_destroy_cil: @@ -892,6 +899,7 @@ STATIC void xfs_destroy_mount_workqueues( struct xfs_mount *mp) { + destroy_workqueue(mp->m_eofblocks_workqueue); destroy_workqueue(mp->m_log_workqueue); destroy_workqueue(mp->m_reclaim_workqueue); destroy_workqueue(mp->m_cil_workqueue); @@ -1393,6 +1401,7 @@ xfs_fs_fill_super( mutex_init(&mp->m_growlock); atomic_set(&mp->m_active_trans, 0); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); + INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); mp->m_super = sb; sb->s_fs_info = mp; diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index ee2d2adaa438..2801b5ce6cdb 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -202,6 +202,15 @@ static ctl_table xfs_table[] = { .extra1 = &xfs_params.fstrm_timer.min, .extra2 = &xfs_params.fstrm_timer.max, }, + { + .procname = "speculative_prealloc_lifetime", + .data = &xfs_params.eofb_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.eofb_timer.min, + .extra2 = &xfs_params.eofb_timer.max, + }, /* please keep this the last entry */ #ifdef CONFIG_PROC_FS { diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index b9937d450f8e..bd8e157c20ef 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -47,6 +47,7 @@ typedef struct xfs_param { xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */ + xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */ } xfs_param_t; /* -- cgit v1.2.1 From 654598bef3731c9ae9b068ac35e6b69674c02841 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Thu, 8 Nov 2012 21:57:20 -0500 Subject: ext4: add operations on extent status tree This patch adds operations on a extent status tree. 
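From a caller's point of view the new operations pair up as insert/find/remove, roughly (a sketch; lblk and len stand for the caller's logical range, and locking is handled inside the helpers as the diff below shows):

        struct extent_status es;
        ext4_lblk_t next;

        /* record a freshly delay-allocated range */
        ext4_es_insert_extent(inode, lblk, len);

        /* es.start is the query block on entry; on return es holds the
         * covering (or next) delayed extent, and 'next' is the start of
         * the following extent, or EXT_MAX_BLOCKS if there is none */
        es.start = lblk;
        next = ext4_es_find_extent(inode, &es);

        /* drop the range once it is written out or invalidated */
        ext4_es_remove_extent(inode, lblk, len);
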
CC: Lukas Czerner Signed-off-by: Yongqiang Yang Signed-off-by: Allison Henderson Signed-off-by: Hugh Dickins Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/Makefile | 2 +- fs/ext4/extents_status.c | 492 +++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/extents_status.h | 20 ++ 3 files changed, 513 insertions(+), 1 deletion(-) create mode 100644 fs/ext4/extents_status.c (limited to 'fs') diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 56fd8f865930..41f22be2ffa4 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ - mmp.o indirect.o + mmp.o indirect.o extents_status.o ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c new file mode 100644 index 000000000000..02c09be3d773 --- /dev/null +++ b/fs/ext4/extents_status.c @@ -0,0 +1,492 @@ +/* + * fs/ext4/extents_status.c + * + * Written by Yongqiang Yang + * Modified by + * Allison Henderson + * Hugh Dickins + * Zheng Liu + * + * Ext4 extents status tree core functions. + */ +#include +#include "ext4.h" +#include "extents_status.h" +#include "ext4_extents.h" + +/* + * According to previous discussion in Ext4 Developer Workshop, we + * will introduce a new structure called io tree to track all extent + * status in order to solve some problems that we have met + * (e.g. Reservation space warning), and provide extent-level locking. + * Delay extent tree is the first step to achieve this goal. It is + * original built by Yongqiang Yang. At that time it is called delay + * extent tree, whose goal is only track delay extent in memory to + * simplify the implementation of fiemap and bigalloc, and introduce + * lseek SEEK_DATA/SEEK_HOLE support. That is why it is still called + * delay extent tree at the following comment. But for better + * understand what it does, it has been rename to extent status tree. + * + * Currently the first step has been done. All delay extents are + * tracked in the tree. It maintains the delay extent when a delay + * allocation is issued, and the delay extent is written out or + * invalidated. Therefore the implementation of fiemap and bigalloc + * are simplified, and SEEK_DATA/SEEK_HOLE are introduced. + * + * The following comment describes the implemenmtation of extent + * status tree and future works. + */ + +/* + * extents status tree implementation for ext4. + * + * + * ========================================================================== + * Extents status encompass delayed extents and extent locks + * + * 1. Why delayed extent implementation ? + * + * Without delayed extent, ext4 identifies a delayed extent by looking + * up page cache, this has several deficiencies - complicated, buggy, + * and inefficient code. + * + * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need + * to know if a block or a range of blocks are belonged to a delayed + * extent. + * + * Let us have a look at how they do without delayed extents implementation. + * -- FIEMAP + * FIEMAP looks up page cache to identify delayed allocations from holes. + * + * -- SEEK_HOLE/DATA + * SEEK_HOLE/DATA has the same problem as FIEMAP. 
+ * + * -- bigalloc + * bigalloc looks up page cache to figure out if a block is + * already under delayed allocation or not to determine whether + * quota reserving is needed for the cluster. + * + * -- punch hole + * punch hole looks up page cache to identify a delayed extent. + * + * -- writeout + * Writeout looks up whole page cache to see if a buffer is + * mapped, If there are not very many delayed buffers, then it is + * time comsuming. + * + * With delayed extents implementation, FIEMAP, SEEK_HOLE/DATA, + * bigalloc and writeout can figure out if a block or a range of + * blocks is under delayed allocation(belonged to a delayed extent) or + * not by searching the delayed extent tree. + * + * + * ========================================================================== + * 2. ext4 delayed extents impelmentation + * + * -- delayed extent + * A delayed extent is a range of blocks which are contiguous + * logically and under delayed allocation. Unlike extent in + * ext4, delayed extent in ext4 is a in-memory struct, there is + * no corresponding on-disk data. There is no limit on length of + * delayed extent, so a delayed extent can contain as many blocks + * as they are contiguous logically. + * + * -- delayed extent tree + * Every inode has a delayed extent tree and all under delayed + * allocation blocks are added to the tree as delayed extents. + * Delayed extents in the tree are ordered by logical block no. + * + * -- operations on a delayed extent tree + * There are three operations on a delayed extent tree: find next + * delayed extent, adding a space(a range of blocks) and removing + * a space. + * + * -- race on a delayed extent tree + * Delayed extent tree is protected inode->i_es_lock. + * + * + * ========================================================================== + * 3. performance analysis + * -- overhead + * 1. There is a cache extent for write access, so if writes are + * not very random, adding space operaions are in O(1) time. + * + * -- gain + * 2. Code is much simpler, more readable, more maintainable and + * more efficient. + * + * + * ========================================================================== + * 4. TODO list + * -- Track all extent status + * + * -- Improve get block process + * + * -- Extent-level locking + */ + +static struct kmem_cache *ext4_es_cachep; + +int __init ext4_init_es(void) +{ + ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT); + if (ext4_es_cachep == NULL) + return -ENOMEM; + return 0; +} + +void ext4_exit_es(void) +{ + if (ext4_es_cachep) + kmem_cache_destroy(ext4_es_cachep); +} + +void ext4_es_init_tree(struct ext4_es_tree *tree) +{ + tree->root = RB_ROOT; + tree->cache_es = NULL; +} + +#ifdef ES_DEBUG__ +static void ext4_es_print_tree(struct inode *inode) +{ + struct ext4_es_tree *tree; + struct rb_node *node; + + printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino); + tree = &EXT4_I(inode)->i_es_tree; + node = rb_first(&tree->root); + while (node) { + struct extent_status *es; + es = rb_entry(node, struct extent_status, rb_node); + printk(KERN_DEBUG " [%u/%u)", es->start, es->len); + node = rb_next(node); + } + printk(KERN_DEBUG "\n"); +} +#else +#define ext4_es_print_tree(inode) +#endif + +static inline ext4_lblk_t extent_status_end(struct extent_status *es) +{ + BUG_ON(es->start + es->len < es->start); + return es->start + es->len - 1; +} + +/* + * search through the tree for an delayed extent with a given offset. If + * it can't be found, try to find next extent. 
+ */ +static struct extent_status *__es_tree_search(struct rb_root *root, + ext4_lblk_t offset) +{ + struct rb_node *node = root->rb_node; + struct extent_status *es = NULL; + + while (node) { + es = rb_entry(node, struct extent_status, rb_node); + if (offset < es->start) + node = node->rb_left; + else if (offset > extent_status_end(es)) + node = node->rb_right; + else + return es; + } + + if (es && offset < es->start) + return es; + + if (es && offset > extent_status_end(es)) { + node = rb_next(&es->rb_node); + return node ? rb_entry(node, struct extent_status, rb_node) : + NULL; + } + + return NULL; +} + +/* + * ext4_es_find_extent: find the 1st delayed extent covering @es->start + * if it exists, otherwise, the next extent after @es->start. + * + * @inode: the inode which owns delayed extents + * @es: delayed extent that we found + * + * Returns the first block of the next extent after es, otherwise + * EXT_MAX_BLOCKS if no delay extent is found. + * Delayed extent is returned via @es. + */ +ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es) +{ + struct ext4_es_tree *tree = NULL; + struct extent_status *es1 = NULL; + struct rb_node *node; + ext4_lblk_t ret = EXT_MAX_BLOCKS; + + read_lock(&EXT4_I(inode)->i_es_lock); + tree = &EXT4_I(inode)->i_es_tree; + + /* find delay extent in cache firstly */ + if (tree->cache_es) { + es1 = tree->cache_es; + if (in_range(es->start, es1->start, es1->len)) { + es_debug("%u cached by [%u/%u)\n", + es->start, es1->start, es1->len); + goto out; + } + } + + es->len = 0; + es1 = __es_tree_search(&tree->root, es->start); + +out: + if (es1) { + tree->cache_es = es1; + es->start = es1->start; + es->len = es1->len; + node = rb_next(&es1->rb_node); + if (node) { + es1 = rb_entry(node, struct extent_status, rb_node); + ret = es1->start; + } + } + + read_unlock(&EXT4_I(inode)->i_es_lock); + return ret; +} + +static struct extent_status * +ext4_es_alloc_extent(ext4_lblk_t start, ext4_lblk_t len) +{ + struct extent_status *es; + es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC); + if (es == NULL) + return NULL; + es->start = start; + es->len = len; + return es; +} + +static void ext4_es_free_extent(struct extent_status *es) +{ + kmem_cache_free(ext4_es_cachep, es); +} + +static struct extent_status * +ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es) +{ + struct extent_status *es1; + struct rb_node *node; + + node = rb_prev(&es->rb_node); + if (!node) + return es; + + es1 = rb_entry(node, struct extent_status, rb_node); + if (es->start == extent_status_end(es1) + 1) { + es1->len += es->len; + rb_erase(&es->rb_node, &tree->root); + ext4_es_free_extent(es); + es = es1; + } + + return es; +} + +static struct extent_status * +ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es) +{ + struct extent_status *es1; + struct rb_node *node; + + node = rb_next(&es->rb_node); + if (!node) + return es; + + es1 = rb_entry(node, struct extent_status, rb_node); + if (es1->start == extent_status_end(es) + 1) { + es->len += es1->len; + rb_erase(node, &tree->root); + ext4_es_free_extent(es1); + } + + return es; +} + +static int __es_insert_extent(struct ext4_es_tree *tree, ext4_lblk_t offset, + ext4_lblk_t len) +{ + struct rb_node **p = &tree->root.rb_node; + struct rb_node *parent = NULL; + struct extent_status *es; + ext4_lblk_t end = offset + len - 1; + + BUG_ON(end < offset); + es = tree->cache_es; + if (es && offset == (extent_status_end(es) + 1)) { + es_debug("cached by [%u/%u)\n", es->start, 
es->len); + es->len += len; + es = ext4_es_try_to_merge_right(tree, es); + goto out; + } else if (es && es->start == end + 1) { + es_debug("cached by [%u/%u)\n", es->start, es->len); + es->start = offset; + es->len += len; + es = ext4_es_try_to_merge_left(tree, es); + goto out; + } else if (es && es->start <= offset && + end <= extent_status_end(es)) { + es_debug("cached by [%u/%u)\n", es->start, es->len); + goto out; + } + + while (*p) { + parent = *p; + es = rb_entry(parent, struct extent_status, rb_node); + + if (offset < es->start) { + if (es->start == end + 1) { + es->start = offset; + es->len += len; + es = ext4_es_try_to_merge_left(tree, es); + goto out; + } + p = &(*p)->rb_left; + } else if (offset > extent_status_end(es)) { + if (offset == extent_status_end(es) + 1) { + es->len += len; + es = ext4_es_try_to_merge_right(tree, es); + goto out; + } + p = &(*p)->rb_right; + } else { + if (extent_status_end(es) <= end) + es->len = offset - es->start + len; + goto out; + } + } + + es = ext4_es_alloc_extent(offset, len); + if (!es) + return -ENOMEM; + rb_link_node(&es->rb_node, parent, p); + rb_insert_color(&es->rb_node, &tree->root); + +out: + tree->cache_es = es; + return 0; +} + +/* + * ext4_es_insert_extent() adds a space to a delayed extent tree. + * Caller holds inode->i_es_lock. + * + * ext4_es_insert_extent is called by ext4_da_write_begin and + * ext4_es_remove_extent. + * + * Return 0 on success, error code on failure. + */ +int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset, + ext4_lblk_t len) +{ + struct ext4_es_tree *tree; + int err = 0; + + es_debug("add [%u/%u) to extent status tree of inode %lu\n", + offset, len, inode->i_ino); + + write_lock(&EXT4_I(inode)->i_es_lock); + tree = &EXT4_I(inode)->i_es_tree; + err = __es_insert_extent(tree, offset, len); + write_unlock(&EXT4_I(inode)->i_es_lock); + + ext4_es_print_tree(inode); + + return err; +} + +/* + * ext4_es_remove_extent() removes a space from a delayed extent tree. + * Caller holds inode->i_es_lock. + * + * Return 0 on success, error code on failure. + */ +int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset, + ext4_lblk_t len) +{ + struct rb_node *node; + struct ext4_es_tree *tree; + struct extent_status *es; + struct extent_status orig_es; + ext4_lblk_t len1, len2, end; + int err = 0; + + es_debug("remove [%u/%u) from extent status tree of inode %lu\n", + offset, len, inode->i_ino); + + end = offset + len - 1; + BUG_ON(end < offset); + write_lock(&EXT4_I(inode)->i_es_lock); + tree = &EXT4_I(inode)->i_es_tree; + es = __es_tree_search(&tree->root, offset); + if (!es) + goto out; + if (es->start > end) + goto out; + + /* Simply invalidate cache_es. */ + tree->cache_es = NULL; + + orig_es.start = es->start; + orig_es.len = es->len; + len1 = offset > es->start ? offset - es->start : 0; + len2 = extent_status_end(es) > end ? 
+ extent_status_end(es) - end : 0; + if (len1 > 0) + es->len = len1; + if (len2 > 0) { + if (len1 > 0) { + err = __es_insert_extent(tree, end + 1, len2); + if (err) { + es->start = orig_es.start; + es->len = orig_es.len; + goto out; + } + } else { + es->start = end + 1; + es->len = len2; + } + goto out; + } + + if (len1 > 0) { + node = rb_next(&es->rb_node); + if (node) + es = rb_entry(node, struct extent_status, rb_node); + else + es = NULL; + } + + while (es && extent_status_end(es) <= end) { + node = rb_next(&es->rb_node); + rb_erase(&es->rb_node, &tree->root); + ext4_es_free_extent(es); + if (!node) { + es = NULL; + break; + } + es = rb_entry(node, struct extent_status, rb_node); + } + + if (es && es->start < end + 1) { + len1 = extent_status_end(es) - end; + es->start = end + 1; + es->len = len1; + } + +out: + write_unlock(&EXT4_I(inode)->i_es_lock); + ext4_es_print_tree(inode); + return err; +} diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 8be2ab9c9425..077f82db092a 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -11,6 +11,15 @@ #ifndef _EXT4_EXTENTS_STATUS_H #define _EXT4_EXTENTS_STATUS_H +/* + * Turn on ES_DEBUG__ to get lots of info about extent status operations. + */ +#ifdef ES_DEBUG__ +#define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) +#else +#define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + struct extent_status { struct rb_node rb_node; ext4_lblk_t start; /* first block extent covers */ @@ -22,4 +31,15 @@ struct ext4_es_tree { struct extent_status *cache_es; /* recently accessed extent */ }; +extern int __init ext4_init_es(void); +extern void ext4_exit_es(void); +extern void ext4_es_init_tree(struct ext4_es_tree *tree); + +extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t len); +extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t len); +extern ext4_lblk_t ext4_es_find_extent(struct inode *inode, + struct extent_status *es); + #endif /* _EXT4_EXTENTS_STATUS_H */ -- cgit v1.2.1 From 9a26b66175e1c221f39bbe09e2e1d0a31a14ba6d Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Thu, 8 Nov 2012 21:57:30 -0500 Subject: ext4: initialize extent status tree Let ext4 initialize the extent status tree of an inode. Signed-off-by: Yongqiang Yang Signed-off-by: Allison Henderson Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 18e89fafebd1..6791d091fbc7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -943,6 +943,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); + ext4_es_init_tree(&ei->i_es_tree); + rwlock_init(&ei->i_es_lock); ei->i_reserved_data_blocks = 0; ei->i_reserved_meta_blocks = 0; ei->i_allocated_meta_blocks = 0; -- cgit v1.2.1 From 51865fda28e585bdcc164474ff6438a9ccdbfada Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Thu, 8 Nov 2012 21:57:32 -0500 Subject: ext4: let ext4 maintain extent status tree This patch lets ext4 maintain the extent status tree. Currently it only tracks delayed extent status in the tree. When a delayed allocation is issued, the related delayed extent is inserted into the tree. When a delayed extent is written out or invalidated, it is removed from the tree.
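For illustration only, and not part of the patch itself: a minimal sketch of the call pattern the tree supports, using the three operations added by the previous patches. The caller and the block range below are hypothetical; only ext4_es_insert_extent(), ext4_es_find_extent() and ext4_es_remove_extent() are real.

	/* Hypothetical caller showing the intended life cycle of a delayed
	 * extent in the status tree. */
	static void example_delalloc_lifecycle(struct inode *inode,
					       ext4_lblk_t lblk, ext4_lblk_t len)
	{
		struct extent_status es;

		/* A delayed allocation is issued: record it in the tree. */
		ext4_es_insert_extent(inode, lblk, len);

		/* Later, answer "is this block delayed?" from the tree
		 * instead of scanning the page cache. */
		es.start = lblk;
		ext4_es_find_extent(inode, &es);

		/* The blocks are written out (or invalidated): drop them. */
		ext4_es_remove_extent(inode, lblk, len);
	}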
Signed-off-by: Yongqiang Yang Signed-off-by: Allison Henderson Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 4 ++++ fs/ext4/indirect.c | 1 + fs/ext4/inode.c | 38 +++++++++++++++++++++++++++++++++++--- fs/ext4/super.c | 12 +++++++++++- 4 files changed, 51 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index dce97de6a409..67660fa2a7e6 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4344,6 +4344,8 @@ void ext4_ext_truncate(struct inode *inode) last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); + err = ext4_es_remove_extent(inode, last_block, + EXT_MAX_BLOCKS - last_block); err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); /* In a multi-transaction truncate, we only make the final @@ -4971,6 +4973,8 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) ext4_ext_invalidate_cache(inode); ext4_discard_preallocations(inode); + err = ext4_es_remove_extent(inode, first_block, + stop_block - first_block); err = ext4_ext_remove_space(inode, first_block, stop_block - 1); ext4_ext_invalidate_cache(inode); diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 292337f27c9c..f6663c3a946d 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -1411,6 +1411,7 @@ void ext4_ind_truncate(struct inode *inode) down_write(&ei->i_data_sem); ext4_discard_preallocations(inode); + ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); /* * The orphan list entry will now protect us from any crash which diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f84bfd6d1867..1e92349272e0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -574,7 +574,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, up_read((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret = check_block_validity(inode, map); + int ret; + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { + /* Delayed allocation blocks may be allocated by + * fallocate and converted to initialized by + * direct I/O. We need to handle the delayed + * extent here. + */ + down_write((&EXT4_I(inode)->i_data_sem)); + goto delayed_mapped; + } + ret = check_block_validity(inode, map); if (ret != 0) return ret; } @@ -656,8 +665,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * set the BH_Da_Mapped bit on them. Its important to do this * under the protection of i_data_sem. */ - if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { + int ret; set_buffers_da_mapped(inode, map); +delayed_mapped: + /* delayed allocation blocks has been allocated */ + ret = ext4_es_remove_extent(inode, map->m_lblk, + map->m_len); + if (ret < 0) + retval = ret; + } } up_write((&EXT4_I(inode)->i_data_sem)); @@ -1303,6 +1320,7 @@ static void ext4_da_page_release_reservation(struct page *page, struct inode *inode = page->mapping->host; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int num_clusters; + ext4_fsblk_t lblk; head = page_buffers(page); bh = head; @@ -1317,11 +1335,15 @@ static void ext4_da_page_release_reservation(struct page *page, curr_off = next_off; } while ((bh = bh->b_this_page) != head); + if (to_release) { + lblk = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + ext4_es_remove_extent(inode, lblk, to_release); + } + /* If we have released all the blocks belonging to a cluster, then we * need to release the reserved space for that cluster.
*/ num_clusters = EXT4_NUM_B2C(sbi, to_release); while (num_clusters > 0) { - ext4_fsblk_t lblk; lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) + ((num_clusters - 1) << sbi->s_cluster_bits); if (sbi->s_cluster_ratio == 1 || @@ -1502,9 +1524,15 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) struct pagevec pvec; struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; + ext4_lblk_t start, last; index = mpd->first_page; end = mpd->next_page - 1; + + start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); + ext4_es_remove_extent(inode, start, last - start + 1); + while (index <= end) { nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); if (nr_pages == 0) @@ -1816,6 +1844,10 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, goto out_unlock; } + retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len); + if (retval) + goto out_unlock; + /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served * and it should not appear on the bh->b_state. */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6791d091fbc7..ad6cd8aeb946 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -50,6 +50,7 @@ #include "xattr.h" #include "acl.h" #include "mballoc.h" +#include "ext4_extents.h" #define CREATE_TRACE_POINTS #include <trace/events/ext4.h> @@ -1033,6 +1034,7 @@ void ext4_clear_inode(struct inode *inode) clear_inode(inode); dquot_drop(inode); ext4_discard_preallocations(inode); + ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); if (EXT4_I(inode)->jinode) { jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode); @@ -5296,9 +5298,14 @@ static int __init ext4_init_fs(void) init_waitqueue_head(&ext4__ioend_wq[i]); } - err = ext4_init_pageio(); + err = ext4_init_es(); if (err) return err; + + err = ext4_init_pageio(); + if (err) + goto out7; + err = ext4_init_system_zone(); if (err) goto out6; @@ -5348,6 +5355,9 @@ out5: ext4_exit_system_zone(); out6: ext4_exit_pageio(); +out7: + ext4_exit_es(); + return err; } -- cgit v1.2.1 From 992e9fdd7b3f656ab8aea895f0038336950774ed Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Thu, 8 Nov 2012 21:57:33 -0500 Subject: ext4: add some tracepoints in extent status tree This patch adds some tracepoints to the extent status tree code.
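The tracepoint definitions themselves live in include/trace/events/ext4.h and are not part of this diff. As a rough, hypothetical sketch of the shape such a definition takes (the fields and format string here are assumptions, not copied from that header):

	TRACE_EVENT(ext4_es_insert_extent,
		TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t len),
		TP_ARGS(inode, start, len),
		TP_STRUCT__entry(
			__field(ino_t,		ino)
			__field(ext4_lblk_t,	start)
			__field(ext4_lblk_t,	len)
		),
		TP_fast_assign(
			__entry->ino	= inode->i_ino;
			__entry->start	= start;
			__entry->len	= len;
		),
		TP_printk("ino %lu es [%u/%u)",
			  (unsigned long) __entry->ino,
			  __entry->start, __entry->len)
	);

Once defined, the events can be enabled at runtime through the usual tracing interface under /sys/kernel/debug/tracing/events/ext4/.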
Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents_status.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 02c09be3d773..564d981a2fcc 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -14,6 +14,8 @@ #include "extents_status.h" #include "ext4_extents.h" +#include <trace/events/ext4.h> + /* * According to previous discussion in Ext4 Developer Workshop, we * will introduce a new structure called io tree to track all extent @@ -224,6 +226,8 @@ ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es) struct rb_node *node; ext4_lblk_t ret = EXT_MAX_BLOCKS; + trace_ext4_es_find_extent_enter(inode, es->start); + read_lock(&EXT4_I(inode)->i_es_lock); tree = &EXT4_I(inode)->i_es_tree; @@ -253,6 +257,8 @@ out: } read_unlock(&EXT4_I(inode)->i_es_lock); + + trace_ext4_es_find_extent_exit(inode, es, ret); return ret; } @@ -393,6 +399,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset, struct ext4_es_tree *tree; int err = 0; + trace_ext4_es_insert_extent(inode, offset, len); es_debug("add [%u/%u) to extent status tree of inode %lu\n", offset, len, inode->i_ino); @@ -422,6 +429,7 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset, ext4_lblk_t len1, len2, end; int err = 0; + trace_ext4_es_remove_extent(inode, offset, len); es_debug("remove [%u/%u) from extent status tree of inode %lu\n", offset, len, inode->i_ino); -- cgit v1.2.1 From 7d1b1fbc95ebf41fee246dde437a77921f3bfec5 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Thu, 8 Nov 2012 21:57:35 -0500 Subject: ext4: reimplement ext4_find_delay_alloc_range on extent status tree Signed-off-by: Yongqiang Yang Signed-off-by: Allison Henderson Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 4 -- fs/ext4/ext4_extents.h | 3 +- fs/ext4/extents.c | 117 ++++++++----------------------------------------- fs/ext4/inode.c | 53 +---------------------- 4 files changed, 20 insertions(+), 157 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index bcc634b26d46..246e38f3915a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2451,14 +2451,10 @@ enum ext4_state_bits { * never, ever appear in a buffer_head's state * flag. See EXT4_MAP_FROM_CLUSTER to see where * this is used. */ - BH_Da_Mapped, /* Delayed allocated block that now has a mapping. This - * flag is set when ext4_map_blocks is called on a - * delayed allocated block to get its real mapping.
*/ }; BUFFER_FNS(Uninit, uninit) TAS_BUFFER_FNS(Uninit, uninit) -BUFFER_FNS(Da_Mapped, da_mapped) /* * Add new method to test wether block and inode bitmaps are properly diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index cb1b2c919963..603bb114735c 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -314,7 +314,6 @@ extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, struct ext4_ext_path *); extern void ext4_ext_drop_refs(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); -extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk, - int search_hint_reverse); +extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); #endif /* _EXT4_EXTENTS */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 67660fa2a7e6..e0bedd1a4ac1 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3461,115 +3461,34 @@ out: /** * ext4_find_delalloc_range: find delayed allocated block in the given range. * - * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns - * whether there are any buffers marked for delayed allocation. It returns '1' - * on the first delalloc'ed buffer head found. If no buffer head in the given - * range is marked for delalloc, it returns 0. - * lblk_start should always be <= lblk_end. - * search_hint_reverse is to indicate that searching in reverse from lblk_end to - * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed - * block sooner). This is useful when blocks are truncated sequentially from - * lblk_start towards lblk_end. + * Return 1 if there is a delalloc block in the range, otherwise 0. */ static int ext4_find_delalloc_range(struct inode *inode, ext4_lblk_t lblk_start, - ext4_lblk_t lblk_end, - int search_hint_reverse) + ext4_lblk_t lblk_end) { - struct address_space *mapping = inode->i_mapping; - struct buffer_head *head, *bh = NULL; - struct page *page; - ext4_lblk_t i, pg_lblk; - pgoff_t index; - - if (!test_opt(inode->i_sb, DELALLOC)) - return 0; - - /* reverse search wont work if fs block size is less than page size */ - if (inode->i_blkbits < PAGE_CACHE_SHIFT) - search_hint_reverse = 0; + struct extent_status es; - if (search_hint_reverse) - i = lblk_end; + es.start = lblk_start; + ext4_es_find_extent(inode, &es); + if (es.len == 0) + return 0; /* there is no delay extent in this tree */ + else if (es.start <= lblk_start && lblk_start < es.start + es.len) + return 1; + else if (lblk_start <= es.start && es.start <= lblk_end) + return 1; else - i = lblk_start; - - index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - - while ((i >= lblk_start) && (i <= lblk_end)) { - page = find_get_page(mapping, index); - if (!page) - goto nextpage; - - if (!page_has_buffers(page)) - goto nextpage; - - head = page_buffers(page); - if (!head) - goto nextpage; - - bh = head; - pg_lblk = index << (PAGE_CACHE_SHIFT - - inode->i_blkbits); - do { - if (unlikely(pg_lblk < lblk_start)) { - /* - * This is possible when fs block size is less - * than page size and our cluster starts/ends in - * middle of the page. So we need to skip the - * initial few blocks till we reach the 'lblk' - */ - pg_lblk++; - continue; - } - - /* Check if the buffer is delayed allocated and that it - * is not yet mapped. (when da-buffers are mapped during - * their writeout, their da_mapped bit is set.) 
- */ - if (buffer_delay(bh) && !buffer_da_mapped(bh)) { - page_cache_release(page); - trace_ext4_find_delalloc_range(inode, - lblk_start, lblk_end, - search_hint_reverse, - 1, i); - return 1; - } - if (search_hint_reverse) - i--; - else - i++; - } while ((i >= lblk_start) && (i <= lblk_end) && - ((bh = bh->b_this_page) != head)); -nextpage: - if (page) - page_cache_release(page); - /* - * Move to next page. 'i' will be the first lblk in the next - * page. - */ - if (search_hint_reverse) - index--; - else - index++; - i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - } - - trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end, - search_hint_reverse, 0, 0); - return 0; + return 0; } -int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk, - int search_hint_reverse) +int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_lblk_t lblk_start, lblk_end; lblk_start = lblk & (~(sbi->s_cluster_ratio - 1)); lblk_end = lblk_start + sbi->s_cluster_ratio - 1; - return ext4_find_delalloc_range(inode, lblk_start, lblk_end, - search_hint_reverse); + return ext4_find_delalloc_range(inode, lblk_start, lblk_end); } /** @@ -3630,7 +3549,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1)); lblk_to = lblk_from + c_offset - 1; - if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) + if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) allocated_clusters--; } @@ -3640,7 +3559,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, lblk_from = lblk_start + num_blks; lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; - if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) + if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) allocated_clusters--; } @@ -3927,7 +3846,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { if (!newex.ee_start_lo && !newex.ee_start_hi) { if ((sbi->s_cluster_ratio > 1) && - ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) + ext4_find_delalloc_cluster(inode, map->m_lblk)) map->m_flags |= EXT4_MAP_FROM_CLUSTER; if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { @@ -4015,7 +3934,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, } if ((sbi->s_cluster_ratio > 1) && - ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) + ext4_find_delalloc_cluster(inode, map->m_lblk)) map->m_flags |= EXT4_MAP_FROM_CLUSTER; /* diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1e92349272e0..7f9ccc1381a9 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -483,49 +483,6 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, return num; } -/* - * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map. 
- */ -static void set_buffers_da_mapped(struct inode *inode, - struct ext4_map_blocks *map) -{ - struct address_space *mapping = inode->i_mapping; - struct pagevec pvec; - int i, nr_pages; - pgoff_t index, end; - - index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - end = (map->m_lblk + map->m_len - 1) >> - (PAGE_CACHE_SHIFT - inode->i_blkbits); - - pagevec_init(&pvec, 0); - while (index <= end) { - nr_pages = pagevec_lookup(&pvec, mapping, index, - min(end - index + 1, - (pgoff_t)PAGEVEC_SIZE)); - if (nr_pages == 0) - break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - struct buffer_head *bh, *head; - - if (unlikely(page->mapping != mapping) || - !PageDirty(page)) - break; - - if (page_has_buffers(page)) { - bh = head = page_buffers(page); - do { - set_buffer_da_mapped(bh); - bh = bh->b_this_page; - } while (bh != head); - } - index++; - } - pagevec_release(&pvec); - } -} - /* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. @@ -661,13 +618,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); - /* If we have successfully mapped the delayed allocated blocks, - * set the BH_Da_Mapped bit on them. Its important to do this - * under the protection of i_data_sem. - */ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { int ret; - set_buffers_da_mapped(inode, map); delayed_mapped: /* delayed allocation blocks has been allocated */ ret = ext4_es_remove_extent(inode, map->m_lblk, @@ -1330,7 +1282,6 @@ static void ext4_da_page_release_reservation(struct page *page, if ((offset <= curr_off) && (buffer_delay(bh))) { to_release++; clear_buffer_delay(bh); - clear_buffer_da_mapped(bh); } curr_off = next_off; } while ((bh = bh->b_this_page) != head); @@ -1347,7 +1298,7 @@ static void ext4_da_page_release_reservation(struct page *page, lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) + ((num_clusters - 1) << sbi->s_cluster_bits); if (sbi->s_cluster_ratio == 1 || - !ext4_find_delalloc_cluster(inode, lblk, 1)) + !ext4_find_delalloc_cluster(inode, lblk)) ext4_da_release_space(inode, 1); num_clusters--; @@ -1453,8 +1404,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, clear_buffer_delay(bh); bh->b_blocknr = pblock; } - if (buffer_da_mapped(bh)) - clear_buffer_da_mapped(bh); if (buffer_unwritten(bh) || buffer_mapped(bh)) BUG_ON(bh->b_blocknr != pblock); -- cgit v1.2.1 From b3aff3e3f61d13586fd46d1ee6f7353ab3050b6d Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Thu, 8 Nov 2012 21:57:37 -0500 Subject: ext4: reimplement fiemap using extent status tree Signed-off-by: Yongqiang Yang Signed-off-by: Allison Henderson Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 184 +++++++----------------------------------------------- 1 file changed, 21 insertions(+), 163 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e0bedd1a4ac1..d3dd6182c07a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4499,193 +4499,51 @@ static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next, struct ext4_ext_cache *newex, struct ext4_extent *ex, void *data) { + struct extent_status es; __u64 logical; __u64 physical; __u64 length; __u32 flags = 0; + ext4_lblk_t next_del; int ret = 0; struct fiemap_extent_info *fieinfo = data; unsigned char blksize_bits; - blksize_bits = inode->i_sb->s_blocksize_bits; - 
logical = (__u64)newex->ec_block << blksize_bits; + es.start = newex->ec_block; + next_del = ext4_es_find_extent(inode, &es); + next = min(next_del, next); if (newex->ec_start == 0) { /* * No extent in extent-tree contains block @newex->ec_start, * then the block may stay in 1)a hole or 2)delayed-extent. - * - * Holes or delayed-extents are processed as follows. - * 1. lookup dirty pages with specified range in pagecache. - * If no page is got, then there is no delayed-extent and - * return with EXT_CONTINUE. - * 2. find the 1st mapped buffer, - * 3. check if the mapped buffer is both in the request range - * and a delayed buffer. If not, there is no delayed-extent, - * then return. - * 4. a delayed-extent is found, the extent will be collected. */ - ext4_lblk_t end = 0; - pgoff_t last_offset; - pgoff_t offset; - pgoff_t index; - pgoff_t start_index = 0; - struct page **pages = NULL; - struct buffer_head *bh = NULL; - struct buffer_head *head = NULL; - unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *); - - pages = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (pages == NULL) - return -ENOMEM; - - offset = logical >> PAGE_SHIFT; -repeat: - last_offset = offset; - head = NULL; - ret = find_get_pages_tag(inode->i_mapping, &offset, - PAGECACHE_TAG_DIRTY, nr_pages, pages); - - if (!(flags & FIEMAP_EXTENT_DELALLOC)) { - /* First time, try to find a mapped buffer. */ - if (ret == 0) { -out: - for (index = 0; index < ret; index++) - page_cache_release(pages[index]); - /* just a hole. */ - kfree(pages); - return EXT_CONTINUE; - } - index = 0; - -next_page: - /* Try to find the 1st mapped buffer. */ - end = ((__u64)pages[index]->index << PAGE_SHIFT) >> - blksize_bits; - if (!page_has_buffers(pages[index])) - goto out; - head = page_buffers(pages[index]); - if (!head) - goto out; - - index++; - bh = head; - do { - if (end >= newex->ec_block + - newex->ec_len) - /* The buffer is out of - * the request range. - */ - goto out; - - if (buffer_mapped(bh) && - end >= newex->ec_block) { - start_index = index - 1; - /* get the 1st mapped buffer. */ - goto found_mapped_buffer; - } - - bh = bh->b_this_page; - end++; - } while (bh != head); - - /* No mapped buffer in the range found in this page, - * We need to look up next page. - */ - if (index >= ret) { - /* There is no page left, but we need to limit - * newex->ec_len. - */ - newex->ec_len = end - newex->ec_block; - goto out; - } - goto next_page; - } else { - /*Find contiguous delayed buffers. */ - if (ret > 0 && pages[0]->index == last_offset) - head = page_buffers(pages[0]); - bh = head; - index = 1; - start_index = 0; + if (es.len == 0) + /* A hole found. */ + return EXT_CONTINUE; + + if (es.start > newex->ec_block) { + /* A hole found. */ + newex->ec_len = min(es.start - newex->ec_block, + newex->ec_len); + return EXT_CONTINUE; } -found_mapped_buffer: - if (bh != NULL && buffer_delay(bh)) { - /* 1st or contiguous delayed buffer found. */ - if (!(flags & FIEMAP_EXTENT_DELALLOC)) { - /* - * 1st delayed buffer found, record - * the start of extent. - */ - flags |= FIEMAP_EXTENT_DELALLOC; - newex->ec_block = end; - logical = (__u64)end << blksize_bits; - } - /* Find contiguous delayed buffers. 
*/ - do { - if (!buffer_delay(bh)) - goto found_delayed_extent; - bh = bh->b_this_page; - end++; - } while (bh != head); - - for (; index < ret; index++) { - if (!page_has_buffers(pages[index])) { - bh = NULL; - break; - } - head = page_buffers(pages[index]); - if (!head) { - bh = NULL; - break; - } - - if (pages[index]->index != - pages[start_index]->index + index - - start_index) { - /* Blocks are not contiguous. */ - bh = NULL; - break; - } - bh = head; - do { - if (!buffer_delay(bh)) - /* Delayed-extent ends. */ - goto found_delayed_extent; - bh = bh->b_this_page; - end++; - } while (bh != head); - } - } else if (!(flags & FIEMAP_EXTENT_DELALLOC)) - /* a hole found. */ - goto out; - -found_delayed_extent: - newex->ec_len = min(end - newex->ec_block, - (ext4_lblk_t)EXT_INIT_MAX_LEN); - if (ret == nr_pages && bh != NULL && - newex->ec_len < EXT_INIT_MAX_LEN && - buffer_delay(bh)) { - /* Have not collected an extent and continue. */ - for (index = 0; index < ret; index++) - page_cache_release(pages[index]); - goto repeat; - } - - for (index = 0; index < ret; index++) - page_cache_release(pages[index]); - kfree(pages); + flags |= FIEMAP_EXTENT_DELALLOC; + newex->ec_len = es.start + es.len - newex->ec_block; } - physical = (__u64)newex->ec_start << blksize_bits; - length = (__u64)newex->ec_len << blksize_bits; - if (ex && ext4_ext_is_uninitialized(ex)) flags |= FIEMAP_EXTENT_UNWRITTEN; if (next == EXT_MAX_BLOCKS) flags |= FIEMAP_EXTENT_LAST; + blksize_bits = inode->i_sb->s_blocksize_bits; + logical = (__u64)newex->ec_block << blksize_bits; + physical = (__u64)newex->ec_start << blksize_bits; + length = (__u64)newex->ec_len << blksize_bits; + ret = fiemap_fill_next_extent(fieinfo, logical, physical, length, flags); if (ret < 0) -- cgit v1.2.1 From c8c0df241cc2719b1262e627f999638411934f60 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Thu, 8 Nov 2012 21:57:40 -0500 Subject: ext4: introduce lseek SEEK_DATA/SEEK_HOLE support This patch makes ext4 really support the SEEK_DATA/SEEK_HOLE flags. Block-mapped and extent-mapped files are implemented together because ext4_map_blocks hides these differences. After applying this patch, xfstest #285 will fail when the file is block-mapped, because block-mapped files don't support fallocate(2). I had tried to use ext4_ext_walk_space() to retrieve the offset for an extent-mapped file, but finally I decided to keep using ext4_map_blocks() to support SEEK_DATA/SEEK_HOLE because ext4_map_blocks() can hide the difference between block-mapped and extent-mapped files. Moreover, in the next step, the extent status tree will track all extent status, and we will be able to get all mappings from this tree. So I think that using ext4_map_blocks() is a better choice.
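For illustration only, and not part of the patch: a minimal userspace sketch of the interface this enables, walking the data segments of a sparse file with the new SEEK_DATA/SEEK_HOLE support. Error handling is trimmed for brevity.

	#define _GNU_SOURCE	/* SEEK_DATA/SEEK_HOLE */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		off_t data = 0, hole;
		int fd;

		if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
			return 1;

		/* Walk alternating data/hole segments; SEEK_DATA fails with
		 * ENXIO once there is no more data past the offset. */
		while ((data = lseek(fd, data, SEEK_DATA)) >= 0) {
			hole = lseek(fd, data, SEEK_HOLE);
			printf("data: [%lld, %lld)\n",
			       (long long)data, (long long)hole);
			data = hole;
		}
		close(fd);
		return 0;
	}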
CC: Hugh Dickins Signed-off-by: Jie Liu Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/file.c | 334 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 332 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index bf3966bccd34..2f5759eb9f89 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -24,6 +24,7 @@ #include <linux/mount.h> #include <linux/path.h> #include <linux/quotaops.h> +#include <linux/pagevec.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" @@ -285,6 +286,324 @@ static int ext4_file_open(struct inode * inode, struct file * filp) return dquot_file_open(inode, filp); } +/* + * Here we use ext4_map_blocks() to get a block mapping for an extent-based + * file rather than ext4_ext_walk_space(), because this lets us implement + * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped files in the same + * function. When the extent status tree has been fully implemented, it will + * track all extent status for a file, and we will be able to use it directly + * to retrieve the offset for SEEK_DATA/SEEK_HOLE. + */ + +/* + * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we need to look up + * the page cache to check whether there is data between [startoff, endoff], + * because if this range contains an unwritten extent, we treat the extent + * as data or as a hole according to whether the page cache has data. + */ +static int ext4_find_unwritten_pgoff(struct inode *inode, + int origin, + struct ext4_map_blocks *map, + loff_t *offset) +{ + struct pagevec pvec; + unsigned int blkbits; + pgoff_t index; + pgoff_t end; + loff_t endoff; + loff_t startoff; + loff_t lastoff; + int found = 0; + + blkbits = inode->i_sb->s_blocksize_bits; + startoff = *offset; + lastoff = startoff; + endoff = (map->m_lblk + map->m_len) << blkbits; + + index = startoff >> PAGE_CACHE_SHIFT; + end = endoff >> PAGE_CACHE_SHIFT; + + pagevec_init(&pvec, 0); + do { + int i, num; + unsigned long nr_pages; + + num = min_t(pgoff_t, end - index, PAGEVEC_SIZE); + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, + (pgoff_t)num); + if (nr_pages == 0) { + if (origin == SEEK_DATA) + break; + + BUG_ON(origin != SEEK_HOLE); + /* + * If this is the first pass through the loop and + * the offset is not beyond the end offset, there is + * a hole at this offset. + */ + if (lastoff == startoff || lastoff < endoff) + found = 1; + break; + } + + /* + * If this is the first pass through the loop and the offset + * is smaller than the first page offset, there is a hole at + * this offset. + */ + if (lastoff == startoff && origin == SEEK_HOLE && + lastoff < page_offset(pvec.pages[0])) { + found = 1; + break; + } + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + struct buffer_head *bh, *head; + + /* + * If the current offset is not beyond the end of the + * given range, it will be a hole.
+ */ + if (lastoff < endoff && origin == SEEK_HOLE && + page->index > end) { + found = 1; + *offset = lastoff; + goto out; + } + + lock_page(page); + + if (unlikely(page->mapping != inode->i_mapping)) { + unlock_page(page); + continue; + } + + if (!page_has_buffers(page)) { + unlock_page(page); + continue; + } + + if (page_has_buffers(page)) { + lastoff = page_offset(page); + bh = head = page_buffers(page); + do { + if (buffer_uptodate(bh) || + buffer_unwritten(bh)) { + if (origin == SEEK_DATA) + found = 1; + } else { + if (origin == SEEK_HOLE) + found = 1; + } + if (found) { + *offset = max_t(loff_t, + startoff, lastoff); + unlock_page(page); + goto out; + } + lastoff += bh->b_size; + bh = bh->b_this_page; + } while (bh != head); + } + + lastoff = page_offset(page) + PAGE_SIZE; + unlock_page(page); + } + + /* + * The no. of pages is less than our desired, that would be a + * hole in there. + */ + if (nr_pages < num && origin == SEEK_HOLE) { + found = 1; + *offset = lastoff; + break; + } + + index = pvec.pages[i - 1]->index + 1; + pagevec_release(&pvec); + } while (index <= end); + +out: + pagevec_release(&pvec); + return found; +} + +/* + * ext4_seek_data() retrieves the offset for SEEK_DATA. + */ +static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) +{ + struct inode *inode = file->f_mapping->host; + struct ext4_map_blocks map; + struct extent_status es; + ext4_lblk_t start, last, end; + loff_t dataoff, isize; + int blkbits; + int ret = 0; + + mutex_lock(&inode->i_mutex); + + isize = i_size_read(inode); + if (offset >= isize) { + mutex_unlock(&inode->i_mutex); + return -ENXIO; + } + + blkbits = inode->i_sb->s_blocksize_bits; + start = offset >> blkbits; + last = start; + end = isize >> blkbits; + dataoff = offset; + + do { + map.m_lblk = last; + map.m_len = end - last + 1; + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { + if (last != start) + dataoff = last << blkbits; + break; + } + + /* + * If there is a delay extent at this offset, + * it will be as a data. + */ + es.start = last; + (void)ext4_es_find_extent(inode, &es); + if (last >= es.start && + last < es.start + es.len) { + if (last != start) + dataoff = last << blkbits; + break; + } + + /* + * If there is a unwritten extent at this offset, + * it will be as a data or a hole according to page + * cache that has data or not. + */ + if (map.m_flags & EXT4_MAP_UNWRITTEN) { + int unwritten; + unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA, + &map, &dataoff); + if (unwritten) + break; + } + + last++; + dataoff = last << blkbits; + } while (last <= end); + + mutex_unlock(&inode->i_mutex); + + if (dataoff > isize) + return -ENXIO; + + if (dataoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) + return -EINVAL; + if (dataoff > maxsize) + return -EINVAL; + + if (dataoff != file->f_pos) { + file->f_pos = dataoff; + file->f_version = 0; + } + + return dataoff; +} + +/* + * ext4_seek_hole() retrieves the offset for SEEK_HOLE. 
+ */ +static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) +{ + struct inode *inode = file->f_mapping->host; + struct ext4_map_blocks map; + struct extent_status es; + ext4_lblk_t start, last, end; + loff_t holeoff, isize; + int blkbits; + int ret = 0; + + mutex_lock(&inode->i_mutex); + + isize = i_size_read(inode); + if (offset >= isize) { + mutex_unlock(&inode->i_mutex); + return -ENXIO; + } + + blkbits = inode->i_sb->s_blocksize_bits; + start = offset >> blkbits; + last = start; + end = isize >> blkbits; + holeoff = offset; + + do { + map.m_lblk = last; + map.m_len = end - last + 1; + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { + last += ret; + holeoff = last << blkbits; + continue; + } + + /* + * If there is a delay extent at this offset, + * we will skip this extent. + */ + es.start = last; + (void)ext4_es_find_extent(inode, &es); + if (last >= es.start && + last < es.start + es.len) { + last = es.start + es.len; + holeoff = last << blkbits; + continue; + } + + /* + * If there is a unwritten extent at this offset, + * it will be as a data or a hole according to page + * cache that has data or not. + */ + if (map.m_flags & EXT4_MAP_UNWRITTEN) { + int unwritten; + unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE, + &map, &holeoff); + if (!unwritten) { + last += ret; + holeoff = last << blkbits; + continue; + } + } + + /* find a hole */ + break; + } while (last <= end); + + mutex_unlock(&inode->i_mutex); + + if (holeoff > isize) + holeoff = isize; + + if (holeoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) + return -EINVAL; + if (holeoff > maxsize) + return -EINVAL; + + if (holeoff != file->f_pos) { + file->f_pos = holeoff; + file->f_version = 0; + } + + return holeoff; +} + /* * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values * by calling generic_file_llseek_size() with the appropriate maxbytes @@ -300,8 +619,19 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int origin) else maxbytes = inode->i_sb->s_maxbytes; - return generic_file_llseek_size(file, offset, origin, - maxbytes, i_size_read(inode)); + switch (origin) { + case SEEK_SET: + case SEEK_CUR: + case SEEK_END: + return generic_file_llseek_size(file, offset, origin, + maxbytes, i_size_read(inode)); + case SEEK_DATA: + return ext4_seek_data(file, offset, maxbytes); + case SEEK_HOLE: + return ext4_seek_hole(file, offset, maxbytes); + } + + return -EINVAL; } const struct file_operations ext4_file_operations = { -- cgit v1.2.1 From a80a6b85b428e6ce12a8363bb1f08d44c50f3252 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 8 Nov 2012 15:53:35 -0800 Subject: revert "epoll: support for disabling items, and a self-test app" Revert commit 03a7beb55b9f ("epoll: support for disabling items, and a self-test app") pending resolution of the issues identified by Michael Kerrisk, copied below. We'll revisit this for 3.8. : I've taken a look at this patch as it currently stands in 3.7-rc1, and : done a bit of testing. (By the way, the test program : tools/testing/selftests/epoll/test_epoll.c does not compile...) : : There are one or two places where the behavior seems a little strange, : so I have a question or two at the end of this mail. But other than : that, I want to check my understanding so that the interface can be : correctly documented. : : Just to go though my understanding, the problem is the following : scenario in a multithreaded application: : : 1. 
Multiple threads are performing epoll_wait() operations, : and maintaining a user-space cache that contains information : corresponding to each file descriptor being monitored by : epoll_wait(). : : 2. At some point, a thread wants to delete (EPOLL_CTL_DEL) : a file descriptor from the epoll interest list, and : delete the corresponding record from the user-space cache. : : 3. The problem with (2) is that some other thread may have : previously done an epoll_wait() that retrieved information : about the fd in question, and may be in the middle of using : information in the cache that relates to that fd. Thus, : there is a potential race. : : 4. The race can't be solved purely in user space, because doing : so would require applying a mutex across the epoll_wait() : call, which would of course blow thread concurrency. : : Right? : : Your solution is the EPOLL_CTL_DISABLE operation. I want to : confirm my understanding about how to use this flag, since : the description that has accompanied the patches so far : has been a bit sparse. : : 0. In the scenario you're concerned about, deleting a file : descriptor means (safely) doing the following: : (a) Deleting the file descriptor from the epoll interest list : using EPOLL_CTL_DEL : (b) Deleting the corresponding record in the user-space cache : : 1. It's only meaningful to use this EPOLL_CTL_DISABLE in : conjunction with EPOLLONESHOT. : : 2. Using EPOLL_CTL_DISABLE without using EPOLLONESHOT in : conjunction is a logical error. : : 3. The correct way to code multithreaded applications using : EPOLL_CTL_DISABLE and EPOLLONESHOT is as follows: : : a. All EPOLL_CTL_ADD and EPOLL_CTL_MOD operations should : specify EPOLLONESHOT. : : b. When a thread wants to delete a file descriptor, it : should do the following: : : [1] Call epoll_ctl(EPOLL_CTL_DISABLE) : [2] If the return status from epoll_ctl(EPOLL_CTL_DISABLE) : was zero, then the file descriptor can be safely : deleted by the thread that made this call. : [3] If the epoll_ctl(EPOLL_CTL_DISABLE) fails with EBUSY, : then the descriptor is in use. In this case, the calling : thread should set a flag in the user-space cache to : indicate that the thread that is using the descriptor : should perform the deletion operation. : : Is all of the above correct? : : The implementation depends on checking whether : (events & ~EP_PRIVATE_BITS) == 0 : This relies on the fact that EPOLL_CTL_ADD and EPOLL_CTL_MOD always : set EPOLLHUP and EPOLLERR in the 'events' mask, and EPOLLONESHOT : causes those flags (as well as all others in ~EP_PRIVATE_BITS) to be : cleared. : : A corollary to the previous paragraph is that using EPOLL_CTL_DISABLE : is only useful in conjunction with EPOLLONESHOT. However, as things : stand, one can use EPOLL_CTL_DISABLE on a file descriptor that does : not have EPOLLONESHOT set in 'events'. This results in the following : (slightly surprising) behavior: : : (a) The first call to epoll_ctl(EPOLL_CTL_DISABLE) returns 0 : (the indicator that the file descriptor can be safely deleted). : (b) The next call to epoll_ctl(EPOLL_CTL_DISABLE) fails with EBUSY. : : This doesn't seem particularly useful, and in fact is probably an : indication that the user made a logic error: they should only be using : epoll_ctl(EPOLL_CTL_DISABLE) on a file descriptor for which : EPOLLONESHOT was set in 'events'. If that is correct, then would it : not make sense to return an error to user space for this case? Cc: Michael Kerrisk Cc: "Paton J.
Lewis" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 38 +++----------------------------------- 1 file changed, 3 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index da72250ddc1c..cd96649bfe62 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -346,7 +346,7 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p) /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ static inline int ep_op_has_event(int op) { - return op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD; + return op != EPOLL_CTL_DEL; } /* Initialize the poll safe wake up structure */ @@ -676,34 +676,6 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) return 0; } -/* - * Disables a "struct epitem" in the eventpoll set. Returns -EBUSY if the item - * had no event flags set, indicating that another thread may be currently - * handling that item's events (in the case that EPOLLONESHOT was being - * used). Otherwise a zero result indicates that the item has been disabled - * from receiving events. A disabled item may be re-enabled via - * EPOLL_CTL_MOD. Must be called with "mtx" held. - */ -static int ep_disable(struct eventpoll *ep, struct epitem *epi) -{ - int result = 0; - unsigned long flags; - - spin_lock_irqsave(&ep->lock, flags); - if (epi->event.events & ~EP_PRIVATE_BITS) { - if (ep_is_linked(&epi->rdllink)) - list_del_init(&epi->rdllink); - /* Ensure ep_poll_callback will not add epi back onto ready - list: */ - epi->event.events &= EP_PRIVATE_BITS; - } - else - result = -EBUSY; - spin_unlock_irqrestore(&ep->lock, flags); - - return result; -} - static void ep_free(struct eventpoll *ep) { struct rb_node *rbp; @@ -1048,6 +1020,8 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) rb_insert_color(&epi->rbn, &ep->rbr); } + + #define PATH_ARR_SIZE 5 /* * These are the number paths of length 1 to 5, that we are allowing to emanate @@ -1813,12 +1787,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, } else error = -ENOENT; break; - case EPOLL_CTL_DISABLE: - if (epi) - error = ep_disable(ep, epi); - else - error = -ENOENT; - break; } mutex_unlock(&ep->mtx); -- cgit v1.2.1 From 848561d368751a1c0f679b9f045a02944506a801 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 8 Nov 2012 15:53:37 -0800 Subject: fanotify: fix missing break Anders Blomdell noted in 2010 that Fanotify lost events and provided a test case. Eric Paris confirmed it was a bug and posted a fix to the list https://groups.google.com/forum/?fromgroups=#!topic/linux.kernel/RrJfTfyW2BE but never applied it. 
Repeated attempts over time to actually get him to apply it have never had a reply from anyone who has raised it. So apply it anyway. Signed-off-by: Alan Cox Reported-by: Anders Blomdell Cc: Eric Paris Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index f35794b97e8e..a50636025364 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -21,6 +21,7 @@ static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new) if ((old->path.mnt == new->path.mnt) && (old->path.dentry == new->path.dentry)) return true; + break; case (FSNOTIFY_EVENT_NONE): return true; default: -- cgit v1.2.1 From 5ffd3412ae5536a4c57469cb8ea31887121dcb2e Mon Sep 17 00:00:00 2001 From: Thomas Betker Date: Wed, 17 Oct 2012 22:59:30 +0200 Subject: jffs2: Fix lock acquisition order bug in jffs2_write_begin jffs2_write_begin() first acquires the page lock, then f->sem. This causes an AB-BA deadlock with jffs2_garbage_collect_live(), which first acquires f->sem, then the page lock: jffs2_garbage_collect_live mutex_lock(&f->sem) (A) jffs2_garbage_collect_dnode jffs2_gc_fetch_page read_cache_page_async do_read_cache_page lock_page(page) (B) jffs2_write_begin grab_cache_page_write_begin find_lock_page lock_page(page) (B) mutex_lock(&f->sem) (A) We fix this by restructuring jffs2_write_begin() to take f->sem before the page lock. However, we make sure that f->sem is not held when calling jffs2_reserve_space(), as this is not permitted by the locking rules. The deadlock above was observed multiple times on an SoC with a dual ARMv7 (Cortex-A9), running the long-term 3.4.11 kernel; it occurred when using scp to copy files from a host system to the ARM target system. The fix was heavily tested on the same target system.
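As an aside for readers unfamiliar with the pattern: a minimal, self-contained sketch of the lock-ordering rule the fix enforces, with generic pthread locks standing in for f->sem and the page lock (this is illustrative code, not jffs2 code).

	#include <pthread.h>

	static pthread_mutex_t f_sem = PTHREAD_MUTEX_INITIALIZER;	/* "A" */
	static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;	/* "B" */

	/* Before the fix, the write path took B then A while the GC path
	 * took A then B, a classic AB-BA inversion: each thread can end up
	 * holding one lock while waiting forever for the other. The fix
	 * gives every path the same order: A (f->sem) first, then B (the
	 * page lock). */
	static void locked_operation(void (*op)(void))
	{
		pthread_mutex_lock(&f_sem);	/* A first, on every path */
		pthread_mutex_lock(&page_lock);	/* B second */
		op();
		pthread_mutex_unlock(&page_lock);
		pthread_mutex_unlock(&f_sem);
	}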
Cc: stable@vger.kernel.org Signed-off-by: Thomas Betker Acked-by: Joakim Tjernlund Signed-off-by: Artem Bityutskiy --- fs/jffs2/file.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 60ef3fb707ff..1506673c087e 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -138,33 +138,39 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, struct page *pg; struct inode *inode = mapping->host; struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + struct jffs2_raw_inode ri; + uint32_t alloc_len = 0; pgoff_t index = pos >> PAGE_CACHE_SHIFT; uint32_t pageofs = index << PAGE_CACHE_SHIFT; int ret = 0; + jffs2_dbg(1, "%s()\n", __func__); + + if (pageofs > inode->i_size) { + ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len, + ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); + if (ret) + return ret; + } + + mutex_lock(&f->sem); pg = grab_cache_page_write_begin(mapping, index, flags); - if (!pg) + if (!pg) { + if (alloc_len) + jffs2_complete_reservation(c); + mutex_unlock(&f->sem); return -ENOMEM; + } *pagep = pg; - jffs2_dbg(1, "%s()\n", __func__); - - if (pageofs > inode->i_size) { + if (alloc_len) { /* Make new hole frag from old EOF to new page */ - struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); - struct jffs2_raw_inode ri; struct jffs2_full_dnode *fn; - uint32_t alloc_len; jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new page\n", (unsigned int)inode->i_size, pageofs); - ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len, - ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); - if (ret) - goto out_page; - - mutex_lock(&f->sem); memset(&ri, 0, sizeof(ri)); ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); @@ -191,7 +197,6 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, if (IS_ERR(fn)) { ret = PTR_ERR(fn); jffs2_complete_reservation(c); - mutex_unlock(&f->sem); goto out_page; } ret = jffs2_add_full_dnode_to_inode(c, f, fn); @@ -206,12 +211,10 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, jffs2_mark_node_obsolete(c, fn->raw); jffs2_free_full_dnode(fn); jffs2_complete_reservation(c); - mutex_unlock(&f->sem); goto out_page; } jffs2_complete_reservation(c); inode->i_size = pageofs; - mutex_unlock(&f->sem); } /* @@ -220,18 +223,18 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, * case of a short-copy. */ if (!PageUptodate(pg)) { - mutex_lock(&f->sem); ret = jffs2_do_readpage_nolock(inode, pg); - mutex_unlock(&f->sem); if (ret) goto out_page; } + mutex_unlock(&f->sem); jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags); return ret; out_page: unlock_page(pg); page_cache_release(pg); + mutex_unlock(&f->sem); return ret; } -- cgit v1.2.1 From 698d8d875a0593f65092f6619d97de49bc5caa45 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 9 Nov 2012 15:31:53 -0500 Subject: nfsd: fix error handling in nfsd4_remove_clid_dir If the credential save fails, then we'll leak our mnt_want_write_file reference. Signed-off-by: Jeff Layton Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4recover.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 43295d45cc2b..0f1e2e21f7d9 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -301,12 +301,13 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) status = nfs4_save_creds(&original_cred); if (status < 0) - goto out; + goto out_drop_write; status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1); nfs4_reset_creds(original_cred); if (status == 0) vfs_fsync(rec_file, 0); +out_drop_write: mnt_drop_write_file(rec_file); out: if (status) -- cgit v1.2.1 From a0af710a6510213672d28f83681c391d36a7555e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 9 Nov 2012 15:06:38 -0500 Subject: nfsd: remove unused argument to nfs4_has_reclaimed_state Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4recover.c | 2 +- fs/nfsd/nfs4state.c | 2 +- fs/nfsd/state.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 0f1e2e21f7d9..151921bd164e 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -320,7 +320,7 @@ purge_old(struct dentry *parent, struct dentry *child) { int status; - if (nfs4_has_reclaimed_state(child->d_name.name, false)) + if (nfs4_has_reclaimed_state(child->d_name.name)) return 0; status = vfs_rmdir(parent->d_inode, child); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 13f3471b02a2..d6b602a92657 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4484,7 +4484,7 @@ alloc_reclaim(void) } int -nfs4_has_reclaimed_state(const char *name, bool use_exchange_id) +nfs4_has_reclaimed_state(const char *name) { unsigned int strhashval = clientstr_hashval(name); struct nfs4_client *clp; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 0498053b8f0e..8053b5747960 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -482,7 +482,7 @@ extern void nfsd4_shutdown_callback(struct nfs4_client *); extern void nfs4_put_delegation(struct nfs4_delegation *dp); extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); extern int nfs4_client_to_reclaim(const char *name); -extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id); +extern int nfs4_has_reclaimed_state(const char *name); extern void release_session_client(struct nfsd4_session *); extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *); -- cgit v1.2.1 From dffe9d8da715bed4d395883add90a2d150d85729 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 10 Nov 2012 22:20:05 -0500 Subject: ext4: do not use ext4_error() when there is no space in dir leaf for csum If there is no space for a checksum in a directory leaf node, previously we would use EXT4_ERROR_INODE() which would mark the file system as inconsistent. While it would be nice to use e2fsck -D, it certainly isn't required, so just print a warning using ext4_warning(). Signed-off-by: "Theodore Ts'o" Cc: "Darrick J. Wong" --- fs/ext4/namei.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 6d600a69fc9d..580af3dfc0eb 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -261,6 +261,12 @@ static __le32 ext4_dirent_csum(struct inode *inode, return cpu_to_le32(csum); } +static void warn_no_space_for_csum(struct inode *inode) +{ + ext4_warning(inode->i_sb, "no space in directory inode %lu leaf for " + "checksum. 
Please run e2fsck -D.", inode->i_ino); +} + int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) { struct ext4_dir_entry_tail *t; @@ -271,8 +277,7 @@ int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) t = get_dirent_tail(inode, dirent); if (!t) { - EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir " - "leaf for checksum. Please run e2fsck -D."); + warn_no_space_for_csum(inode); return 0; } @@ -294,8 +299,7 @@ static void ext4_dirent_csum_set(struct inode *inode, t = get_dirent_tail(inode, dirent); if (!t) { - EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir " - "leaf for checksum. Please run e2fsck -D."); + warn_no_space_for_csum(inode); return; } @@ -377,8 +381,7 @@ static int ext4_dx_csum_verify(struct inode *inode, count = le16_to_cpu(c->count); if (count_offset + (limit * sizeof(struct dx_entry)) > EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { - EXT4_ERROR_INODE(inode, "metadata_csum set but no space for " - "tree checksum found. Run e2fsck -D."); + warn_no_space_for_csum(inode); return 1; } t = (struct dx_tail *)(((struct dx_entry *)c) + limit); @@ -408,8 +411,7 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent) count = le16_to_cpu(c->count); if (count_offset + (limit * sizeof(struct dx_entry)) > EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { - EXT4_ERROR_INODE(inode, "metadata_csum set but no space for " - "tree checksum. Run e2fsck -D."); + warn_no_space_for_csum(inode); return; } t = (struct dx_tail *)(((struct dx_entry *)c) + limit); -- cgit v1.2.1 From 5a8477660d9ddc090203736d7271137265cb25bb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 12 Nov 2012 01:19:02 -0500 Subject: kill bogus BUG_ON() in do_close_on_exec() It can be legitimately triggered via procfs access. Now, at least 2 of 3 of get_files_struct() callers in procfs are useless, but when and if we get rid of those we can always add WARN_ON() here. BUG_ON() at that spot is simply wrong. Signed-off-by: Al Viro --- fs/file.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index d3b5fa80b71b..331e7d24d9d3 100644 --- a/fs/file.c +++ b/fs/file.c @@ -685,7 +685,6 @@ void do_close_on_exec(struct files_struct *files) struct fdtable *fdt; /* exec unshares first */ - BUG_ON(atomic_read(&files->count) != 1); spin_lock(&files->file_lock); for (i = 0; ; i++) { unsigned long set; -- cgit v1.2.1 From 2873d2147e1e14b82367bde14354a011ffda0496 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 12 Nov 2012 15:00:48 -0500 Subject: nfsd: add a usermodehelper upcall for NFSv4 client ID tracking Add a new client tracker upcall type that uses call_usermodehelper to call out to a program. This seems to be the preferred method of calling out to usermode these days for seldom-called upcalls. It's simple and doesn't require a running daemon, so it should "just work" as long as the binary is installed. The client tracking exit operation is also changed to check for a NULL pointer before running. The UMH upcall doesn't need to do anything at module teardown time. Signed-off-by: Jeff Layton Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4recover.c | 134 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 133 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 151921bd164e..2fc2f6cb8d95 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -927,6 +927,137 @@ static struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = { .grace_done = nfsd4_cld_grace_done, }; +/* upcall via usermodehelper */ +static char cltrack_prog[PATH_MAX] = "/sbin/nfsdcltrack"; +module_param_string(cltrack_prog, cltrack_prog, sizeof(cltrack_prog), + S_IRUGO|S_IWUSR); +MODULE_PARM_DESC(cltrack_prog, "Path to the nfsdcltrack upcall program"); + +static int +nfsd4_umh_cltrack_upcall(char *cmd, char *arg) +{ + char *envp[] = { NULL }; + char *argv[4]; + int ret; + + if (unlikely(!cltrack_prog[0])) { + dprintk("%s: cltrack_prog is disabled\n", __func__); + return -EACCES; + } + + dprintk("%s: cmd: %s\n", __func__, cmd); + dprintk("%s: arg: %s\n", __func__, arg ? arg : "(null)"); + + argv[0] = (char *)cltrack_prog; + argv[1] = cmd; + argv[2] = arg; + argv[3] = NULL; + + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); + /* + * Disable the upcall mechanism if we're getting an ENOENT or EACCES + * error. The admin can re-enable it on the fly by using sysfs + * once the problem has been fixed. + */ + if (ret == -ENOENT || ret == -EACCES) { + dprintk("NFSD: %s was not found or isn't executable (%d). " + "Setting cltrack_prog to blank string!", + cltrack_prog, ret); + cltrack_prog[0] = '\0'; + } + dprintk("%s: %s return value: %d\n", __func__, cltrack_prog, ret); + + return ret; +} + +static char * +bin_to_hex_dup(const unsigned char *src, int srclen) +{ + int i; + char *buf, *hex; + + /* +1 for terminating NULL */ + buf = kmalloc((srclen * 2) + 1, GFP_KERNEL); + if (!buf) + return buf; + + hex = buf; + for (i = 0; i < srclen; i++) { + sprintf(hex, "%2.2x", *src++); + hex += 2; + } + return buf; +} + +static int +nfsd4_umh_cltrack_init(struct net __attribute__((unused)) *net) +{ + return nfsd4_umh_cltrack_upcall("init", NULL); +} + +static void +nfsd4_umh_cltrack_create(struct nfs4_client *clp) +{ + char *hexid; + + hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); + if (!hexid) { + dprintk("%s: can't allocate memory for upcall!\n", __func__); + return; + } + nfsd4_umh_cltrack_upcall("create", hexid); + kfree(hexid); +} + +static void +nfsd4_umh_cltrack_remove(struct nfs4_client *clp) +{ + char *hexid; + + hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); + if (!hexid) { + dprintk("%s: can't allocate memory for upcall!\n", __func__); + return; + } + nfsd4_umh_cltrack_upcall("remove", hexid); + kfree(hexid); +} + +static int +nfsd4_umh_cltrack_check(struct nfs4_client *clp) +{ + int ret; + char *hexid; + + hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); + if (!hexid) { + dprintk("%s: can't allocate memory for upcall!\n", __func__); + return -ENOMEM; + } + ret = nfsd4_umh_cltrack_upcall("check", hexid); + kfree(hexid); + return ret; +} + +static void +nfsd4_umh_cltrack_grace_done(struct net __attribute__((unused)) *net, + time_t boot_time) +{ + char timestr[22]; /* FIXME: better way to determine max size? 
*/ + sprintf(timestr, "%ld", boot_time); + nfsd4_umh_cltrack_upcall("gracedone", timestr); +} + +static struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = { + .init = nfsd4_umh_cltrack_init, + .exit = NULL, + .create = nfsd4_umh_cltrack_create, + .remove = nfsd4_umh_cltrack_remove, + .check = nfsd4_umh_cltrack_check, + .grace_done = nfsd4_umh_cltrack_grace_done, +}; + int nfsd4_client_tracking_init(struct net *net) { @@ -957,7 +1088,8 @@ void nfsd4_client_tracking_exit(struct net *net) { if (client_tracking_ops) { - client_tracking_ops->exit(net); + if (client_tracking_ops->exit) + client_tracking_ops->exit(net); client_tracking_ops = NULL; } } -- cgit v1.2.1 From 2d77bf0a55d64559adb2d48a37bc7e876d6adc11 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 12 Nov 2012 15:00:49 -0500 Subject: nfsd: change heuristic for selecting the client_tracking_ops First, try to use the new usermodehelper upcall. It should succeed or fail quickly, so there's little cost to doing so. If it fails, and the legacy tracking dir exists, use that. If it doesn't exist then fall back to using nfsdcld. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4recover.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 2fc2f6cb8d95..e71f713bd7c0 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -1064,17 +1064,35 @@ nfsd4_client_tracking_init(struct net *net) int status; struct path path; - if (!client_tracking_ops) { - client_tracking_ops = &nfsd4_cld_tracking_ops; - status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path); - if (!status) { - if (S_ISDIR(path.dentry->d_inode->i_mode)) - client_tracking_ops = - &nfsd4_legacy_tracking_ops; - path_put(&path); - } + /* just run the init if the method is already decided */ + if (client_tracking_ops) + goto do_init; + + /* + * First, try a UMH upcall. It should succeed or fail quickly, so + * there's little harm in trying that first. + */ + client_tracking_ops = &nfsd4_umh_tracking_ops; + status = client_tracking_ops->init(net); + if (!status) + return status; + + /* + * See if the recoverydir exists and is a directory. If it is, + * then use the legacy ops. + */ + client_tracking_ops = &nfsd4_legacy_tracking_ops; + status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path); + if (!status) { + status = S_ISDIR(path.dentry->d_inode->i_mode); + path_put(&path); + if (status) + goto do_init; } + /* Finally, try to use nfsdcld */ + client_tracking_ops = &nfsd4_cld_tracking_ops; +do_init: status = client_tracking_ops->init(net); if (status) { printk(KERN_WARNING "NFSD: Unable to initialize client " -- cgit v1.2.1 From f3aa7e24c91ee3fd387150c2c5a9934b09f44ec5 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 12 Nov 2012 15:00:50 -0500 Subject: nfsd: pass info about the legacy recoverydir in environment variables The usermodehelper upcall program can then decide to use this info as a (one-way) transition mechanism to the new scheme. When a "check" upcall occurs and the client doesn't exist in the database, we can look to see whether the directory exists. If it does, then we'd add the client to the database, remove the legacy recdir, and return success to the kernel to allow the recovery to proceed. For gracedone, we simply pass the v4recovery "topdir" so that the upcall can clean it out prior to returning to the kernel. A module parm is also added to disable the legacy conversion if the admin chooses.
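The shape of such an upcall is simple. As a rough sketch of the general pattern (the helper path, environment key, and function name below are illustrative assumptions, not the nfsd code itself):

#include <linux/kmod.h>

static int example_cltrack_upcall(char *cmd, char *arg, char *legacy_env)
{
	/* one optional "KEY=value" string, NULL-terminated */
	char *envp[2];
	char *argv[4];

	envp[0] = legacy_env;	/* may be NULL; the environment is then empty */
	envp[1] = NULL;

	argv[0] = "/sbin/example-cltrack";	/* hypothetical helper binary */
	argv[1] = cmd;				/* e.g. "init", "create", "gracedone" */
	argv[2] = arg;
	argv[3] = NULL;

	/* UMH_WAIT_PROC: block until the helper process exits */
	return call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
}

Passing optional state through the environment, as the patch below does, keeps the positional argv interface stable for existing helpers while letting newer ones opt in to the extra information.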
Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4recover.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 82 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index e71f713bd7c0..38af61556895 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -933,10 +933,75 @@ module_param_string(cltrack_prog, cltrack_prog, sizeof(cltrack_prog), S_IRUGO|S_IWUSR); MODULE_PARM_DESC(cltrack_prog, "Path to the nfsdcltrack upcall program"); +static bool cltrack_legacy_disable; +module_param(cltrack_legacy_disable, bool, S_IRUGO|S_IWUSR); +MODULE_PARM_DESC(cltrack_legacy_disable, + "Disable legacy recoverydir conversion. Default: false"); + +#define LEGACY_TOPDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_TOPDIR=" +#define LEGACY_RECDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_RECDIR=" + +static char * +nfsd4_cltrack_legacy_topdir(void) +{ + int copied; + size_t len; + char *result; + + if (cltrack_legacy_disable) + return NULL; + + len = strlen(LEGACY_TOPDIR_ENV_PREFIX) + + strlen(nfs4_recoverydir()) + 1; + + result = kmalloc(len, GFP_KERNEL); + if (!result) + return result; + + copied = snprintf(result, len, LEGACY_TOPDIR_ENV_PREFIX "%s", + nfs4_recoverydir()); + if (copied >= len) { + /* just return nothing if output was truncated */ + kfree(result); + return NULL; + } + + return result; +} + +static char * +nfsd4_cltrack_legacy_recdir(const char *recdir) +{ + int copied; + size_t len; + char *result; + + if (cltrack_legacy_disable) + return NULL; + + /* +1 is for '/' between "topdir" and "recdir" */ + len = strlen(LEGACY_RECDIR_ENV_PREFIX) + + strlen(nfs4_recoverydir()) + 1 + HEXDIR_LEN; + + result = kmalloc(len, GFP_KERNEL); + if (!result) + return result; + + copied = snprintf(result, len, LEGACY_RECDIR_ENV_PREFIX "%s/%s", + nfs4_recoverydir(), recdir); + if (copied >= len) { + /* just return nothing if output was truncated */ + kfree(result); + return NULL; + } + + return result; +} + static int -nfsd4_umh_cltrack_upcall(char *cmd, char *arg) +nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *legacy) { - char *envp[] = { NULL }; + char *envp[2]; char *argv[4]; int ret; @@ -947,6 +1012,10 @@ nfsd4_umh_cltrack_upcall(char *cmd, char *arg) dprintk("%s: cmd: %s\n", __func__, cmd); dprintk("%s: arg: %s\n", __func__, arg ? arg : "(null)"); + dprintk("%s: legacy: %s\n", __func__, legacy ? 
legacy : "(null)"); + + envp[0] = legacy; + envp[1] = NULL; argv[0] = (char *)cltrack_prog; argv[1] = cmd; @@ -992,7 +1061,7 @@ bin_to_hex_dup(const unsigned char *src, int srclen) static int nfsd4_umh_cltrack_init(struct net __attribute__((unused)) *net) { - return nfsd4_umh_cltrack_upcall("init", NULL); + return nfsd4_umh_cltrack_upcall("init", NULL, NULL); } static void @@ -1005,7 +1074,7 @@ nfsd4_umh_cltrack_create(struct nfs4_client *clp) dprintk("%s: can't allocate memory for upcall!\n", __func__); return; } - nfsd4_umh_cltrack_upcall("create", hexid); + nfsd4_umh_cltrack_upcall("create", hexid, NULL); kfree(hexid); } @@ -1019,7 +1088,7 @@ nfsd4_umh_cltrack_remove(struct nfs4_client *clp) dprintk("%s: can't allocate memory for upcall!\n", __func__); return; } - nfsd4_umh_cltrack_upcall("remove", hexid); + nfsd4_umh_cltrack_upcall("remove", hexid, NULL); kfree(hexid); } @@ -1027,14 +1096,16 @@ static int nfsd4_umh_cltrack_check(struct nfs4_client *clp) { int ret; - char *hexid; + char *hexid, *legacy; hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); if (!hexid) { dprintk("%s: can't allocate memory for upcall!\n", __func__); return -ENOMEM; } - ret = nfsd4_umh_cltrack_upcall("check", hexid); + legacy = nfsd4_cltrack_legacy_recdir(clp->cl_recdir); + ret = nfsd4_umh_cltrack_upcall("check", hexid, legacy); + kfree(legacy); kfree(hexid); return ret; } @@ -1043,10 +1114,13 @@ static void nfsd4_umh_cltrack_grace_done(struct net __attribute__((unused)) *net, time_t boot_time) { + char *legacy; char timestr[22]; /* FIXME: better way to determine max size? */ sprintf(timestr, "%ld", boot_time); - nfsd4_umh_cltrack_upcall("gracedone", timestr); + legacy = nfsd4_cltrack_legacy_topdir(); + nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy); + kfree(legacy); } static struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = { -- cgit v1.2.1 From 8b0554e9a24298c91de89a779a714c87073380a2 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 12 Nov 2012 15:00:51 -0500 Subject: nfsd: warn about impending removal of nfsdcld upcall Let's shoot for removing the nfsdcld upcall in 3.10. Most likely, no one is actually using it so I don't expect this warning to fire often (except maybe on misconfigured systems). Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4recover.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 38af61556895..6aaf5d92a43c 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -1166,6 +1166,9 @@ nfsd4_client_tracking_init(struct net *net) /* Finally, try to use nfsdcld */ client_tracking_ops = &nfsd4_cld_tracking_ops; + printk(KERN_WARNING "NFSD: the nfsdcld client tracking upcall will be " + "removed in 3.10. Please transition to using " + "nfsdcltrack.\n"); do_init: status = client_tracking_ops->init(net); if (status) { -- cgit v1.2.1 From 278c931cb05ae624df8c82b6bdfbb0e03392cde7 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 12 Nov 2012 15:00:52 -0500 Subject: nfsd: have nfsd4_find_reclaim_client take a char * argument Currently, it takes a client pointer, but later we're going to need to search for these records without knowing whether a matching client even exists. Signed-off-by: Jeff Layton Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4recover.c | 2 +- fs/nfsd/nfs4state.c | 11 ++++------- fs/nfsd/state.h | 2 +- 3 files changed, 6 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 6aaf5d92a43c..4e92fb38cfb2 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -486,7 +486,7 @@ nfsd4_check_legacy_client(struct nfs4_client *clp) return 0; /* look for it in the reclaim hashtable otherwise */ - if (nfsd4_find_reclaim_client(clp)) { + if (nfsd4_find_reclaim_client(clp->cl_recdir)) { set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); return 0; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d6b602a92657..18e554942da3 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4537,19 +4537,16 @@ nfs4_release_reclaim(void) /* * called from OPEN, CLAIM_PREVIOUS with a new clientid. */ struct nfs4_client_reclaim * -nfsd4_find_reclaim_client(struct nfs4_client *clp) +nfsd4_find_reclaim_client(const char *recdir) { unsigned int strhashval; struct nfs4_client_reclaim *crp = NULL; - dprintk("NFSD: nfs4_find_reclaim_client for %.*s with recdir %s\n", - clp->cl_name.len, clp->cl_name.data, - clp->cl_recdir); + dprintk("NFSD: nfs4_find_reclaim_client for recdir %s\n", recdir); - /* find clp->cl_name in reclaim_str_hashtbl */ - strhashval = clientstr_hashval(clp->cl_recdir); + strhashval = clientstr_hashval(recdir); list_for_each_entry(crp, &reclaim_str_hashtbl[strhashval], cr_strhash) { - if (same_name(crp->cr_recdir, clp->cl_recdir)) { + if (same_name(crp->cr_recdir, recdir)) { return crp; } } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 8053b5747960..c41c28020cad 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -466,7 +466,7 @@ extern void nfs4_lock_state(void); extern void nfs4_unlock_state(void); extern int nfs4_in_grace(void); extern void nfs4_release_reclaim(void); -extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(struct nfs4_client *crp); +extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir); extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions); extern void nfs4_free_openowner(struct nfs4_openowner *); extern void nfs4_free_lockowner(struct nfs4_lockowner *); -- cgit v1.2.1 From ce30e5392fcb26b6aa53bb16d06da1d7d8bb0863 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 12 Nov 2012 15:00:53 -0500 Subject: nfsd: break out reclaim record removal into separate function We'll need to be able to call this from nfs4recover.c eventually. Signed-off-by: Jeff Layton Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 12 +++++++++--- fs/nfsd/state.h | 1 + 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 18e554942da3..24dcda2b327c 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4516,6 +4516,14 @@ nfs4_client_to_reclaim(const char *name) return 1; } +void +nfs4_remove_reclaim_record(struct nfs4_client_reclaim *crp) +{ + list_del(&crp->cr_strhash); + kfree(crp); + reclaim_str_hashtbl_size--; +} + void nfs4_release_reclaim(void) { @@ -4526,9 +4534,7 @@ nfs4_release_reclaim(void) while (!list_empty(&reclaim_str_hashtbl[i])) { crp = list_entry(reclaim_str_hashtbl[i].next, struct nfs4_client_reclaim, cr_strhash); - list_del(&crp->cr_strhash); - kfree(crp); - reclaim_str_hashtbl_size--; + nfs4_remove_reclaim_record(crp); } } BUG_ON(reclaim_str_hashtbl_size); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index c41c28020cad..3528616c955e 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -465,6 +465,7 @@ extern __be32 nfs4_preprocess_stateid_op(struct net *net, extern void nfs4_lock_state(void); extern void nfs4_unlock_state(void); extern int nfs4_in_grace(void); +void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *); extern void nfs4_release_reclaim(void); extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir); extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions); -- cgit v1.2.1 From 772a9bbbb5769c646c74452ef21df538bbe2ebf0 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 12 Nov 2012 15:00:54 -0500 Subject: nfsd: make nfs4_client_to_reclaim return a pointer to the reclaim record Later callers will need to make changes to the record. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 20 ++++++++++---------- fs/nfsd/state.h | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 24dcda2b327c..1c6f82e4335e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4498,22 +4498,22 @@ nfs4_has_reclaimed_state(const char *name) /* * failure => all reset bets are off, nfserr_no_grace... 
*/ -int +struct nfs4_client_reclaim * nfs4_client_to_reclaim(const char *name) { unsigned int strhashval; - struct nfs4_client_reclaim *crp = NULL; + struct nfs4_client_reclaim *crp; dprintk("NFSD nfs4_client_to_reclaim NAME: %.*s\n", HEXDIR_LEN, name); crp = alloc_reclaim(); - if (!crp) - return 0; - strhashval = clientstr_hashval(name); - INIT_LIST_HEAD(&crp->cr_strhash); - list_add(&crp->cr_strhash, &reclaim_str_hashtbl[strhashval]); - memcpy(crp->cr_recdir, name, HEXDIR_LEN); - reclaim_str_hashtbl_size++; - return 1; + if (crp) { + strhashval = clientstr_hashval(name); + INIT_LIST_HEAD(&crp->cr_strhash); + list_add(&crp->cr_strhash, &reclaim_str_hashtbl[strhashval]); + memcpy(crp->cr_recdir, name, HEXDIR_LEN); + reclaim_str_hashtbl_size++; + } + return crp; } void diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 3528616c955e..3f8b26b9b47b 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -482,7 +482,7 @@ extern void nfsd4_destroy_callback_queue(void); extern void nfsd4_shutdown_callback(struct nfs4_client *); extern void nfs4_put_delegation(struct nfs4_delegation *dp); extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); -extern int nfs4_client_to_reclaim(const char *name); +extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name); extern int nfs4_has_reclaimed_state(const char *name); extern void release_session_client(struct nfsd4_session *); extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *); -- cgit v1.2.1 From 0ce0c2b5d23080eec39ccc52354be1eea326ed5f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 12 Nov 2012 15:00:55 -0500 Subject: nfsd: don't search for client by hash on legacy reboot recovery gracedone When nfsd starts, the legacy reboot recovery code creates a tracking struct for each directory in the v4recoverydir. When the grace period ends, it basically does a "readdir" on the directory again, and matches each dentry in there to an existing client id to see if it should be removed or not. If the matching client doesn't exist, or hasn't reclaimed its state then it will remove that dentry. This is pretty inefficient since it involves doing a lot of hash-bucket searching. It also means that we have to keep relying on being able to search for a nfs4_client by md5 hashed cl_recdir name. Instead, add a pointer to the nfs4_client that indicates the association between the nfs4_client_reclaim and nfs4_client. When a reclaim operation comes in, we set the pointer to make that association. On gracedone, the legacy client tracker will keep the recdir around iff: 1/ there is a reclaim record for the directory ...and... 2/ there's an association between the reclaim record and a client record -- that is, a create or check operation was performed on the client that matches that directory. Signed-off-by: Jeff Layton Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4recover.c | 31 +++++++++++++++++++++++++++---- fs/nfsd/nfs4state.c | 12 +++++------- fs/nfsd/state.h | 4 ++-- 3 files changed, 34 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 4e92fb38cfb2..3048c012d4bc 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -65,6 +65,7 @@ struct nfsd4_client_tracking_ops { static struct file *rec_file; static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; static struct nfsd4_client_tracking_ops *client_tracking_ops; +static bool in_grace; static int nfs4_save_creds(const struct cred **original_creds) @@ -142,6 +143,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) const struct cred *original_cred; char *dname = clp->cl_recdir; struct dentry *dir, *dentry; + struct nfs4_client_reclaim *crp; int status; dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname); @@ -182,13 +184,19 @@ out_put: dput(dentry); out_unlock: mutex_unlock(&dir->d_inode->i_mutex); - if (status == 0) + if (status == 0) { + if (in_grace) { + crp = nfs4_client_to_reclaim(clp->cl_recdir); + if (crp) + crp->cr_clp = clp; + } vfs_fsync(rec_file, 0); - else + } else { printk(KERN_ERR "NFSD: failed to write recovery record" " (err %d); please check that %s exists" " and is writeable", status, user_recovery_dirname); + } mnt_drop_write_file(rec_file); nfs4_reset_creds(original_cred); } @@ -289,6 +297,7 @@ static void nfsd4_remove_clid_dir(struct nfs4_client *clp) { const struct cred *original_cred; + struct nfs4_client_reclaim *crp; int status; if (!rec_file || !test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) @@ -305,8 +314,15 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1); nfs4_reset_creds(original_cred); - if (status == 0) + if (status == 0) { vfs_fsync(rec_file, 0); + if (in_grace) { + /* remove reclaim record */ + crp = nfsd4_find_reclaim_client(clp->cl_recdir); + if (crp) + nfs4_remove_reclaim_record(crp); + } + } out_drop_write: mnt_drop_write_file(rec_file); out: @@ -336,6 +352,7 @@ nfsd4_recdir_purge_old(struct net *net, time_t boot_time) { int status; + in_grace = false; if (!rec_file) return; status = mnt_want_write_file(rec_file); @@ -410,6 +427,8 @@ nfsd4_init_recdir(void) } nfs4_reset_creds(original_cred); + if (!status) + in_grace = true; return status; } @@ -481,13 +500,17 @@ nfs4_recoverydir(void) static int nfsd4_check_legacy_client(struct nfs4_client *clp) { + struct nfs4_client_reclaim *crp; + /* did we already find that this client is stable? 
*/ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return 0; /* look for it in the reclaim hashtable otherwise */ - if (nfsd4_find_reclaim_client(clp->cl_recdir)) { + crp = nfsd4_find_reclaim_client(clp->cl_recdir); + if (crp) { set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + crp->cr_clp = clp; return 0; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1c6f82e4335e..559ab574d46b 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4483,16 +4483,13 @@ alloc_reclaim(void) return kmalloc(sizeof(struct nfs4_client_reclaim), GFP_KERNEL); } -int +bool nfs4_has_reclaimed_state(const char *name) { - unsigned int strhashval = clientstr_hashval(name); - struct nfs4_client *clp; + struct nfs4_client_reclaim *crp; - clp = find_confirmed_client_by_str(name, strhashval); - if (!clp) - return 0; - return test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + crp = nfsd4_find_reclaim_client(name); + return (crp && crp->cr_clp); } /* @@ -4511,6 +4508,7 @@ nfs4_client_to_reclaim(const char *name) INIT_LIST_HEAD(&crp->cr_strhash); list_add(&crp->cr_strhash, &reclaim_str_hashtbl[strhashval]); memcpy(crp->cr_recdir, name, HEXDIR_LEN); + crp->cr_clp = NULL; reclaim_str_hashtbl_size++; } return crp; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 3f8b26b9b47b..cf9f7ba4df8d 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -304,6 +304,7 @@ is_client_expired(struct nfs4_client *clp) */ struct nfs4_client_reclaim { struct list_head cr_strhash; /* hash by cr_name */ + struct nfs4_client *cr_clp; /* pointer to associated clp */ char cr_recdir[HEXDIR_LEN]; /* recover dir */ }; @@ -464,7 +465,6 @@ extern __be32 nfs4_preprocess_stateid_op(struct net *net, stateid_t *stateid, int flags, struct file **filp); extern void nfs4_lock_state(void); extern void nfs4_unlock_state(void); -extern int nfs4_in_grace(void); void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *); extern void nfs4_release_reclaim(void); extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir); @@ -483,7 +483,7 @@ extern void nfsd4_shutdown_callback(struct nfs4_client *); extern void nfs4_put_delegation(struct nfs4_delegation *dp); extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name); -extern int nfs4_has_reclaimed_state(const char *name); +extern bool nfs4_has_reclaimed_state(const char *name); extern void release_session_client(struct nfsd4_session *); extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *); -- cgit v1.2.1 From ac55fdc408039b425a2fa3cbcaed7444e5339f9a Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 12 Nov 2012 15:00:56 -0500 Subject: nfsd: move the confirmed and unconfirmed hlists to a rbtree The current code requires that we md5 hash the name in order to store the client in the confirmed and unconfirmed trees. Change it instead to store the clients in a pair of rbtrees, and simply compare the cl_names directly instead of hashing them. This also necessitates that we add a new flag to the clp->cl_flags field to indicate which tree the client is currently in. Signed-off-by: Jeff Layton Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 144 +++++++++++++++++++++++++++++++++------------------- fs/nfsd/state.h | 3 +- 2 files changed, 95 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 559ab574d46b..99998a1eb426 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -412,10 +412,10 @@ static unsigned int clientstr_hashval(const char *name) * reclaim_str_hashtbl[] holds known client info from previous reset/reboot * used in reboot/reset lease grace period processing * - * conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed + * conf_id_hashtbl[], and conf_name_tree hold confirmed * setclientid_confirmed info. * - * unconf_str_hastbl[] and unconf_id_hashtbl[] hold unconfirmed + * unconf_id_hashtbl[] and unconf_name_tree hold unconfirmed * setclientid info. * * client_lru holds client queue ordered by nfs4_client.cl_time @@ -423,13 +423,15 @@ static unsigned int clientstr_hashval(const char *name) * * close_lru holds (open) stateowner queue ordered by nfs4_stateowner.so_time * for last close replay. + * + * All of the above fields are protected by the client_mutex. */ static struct list_head reclaim_str_hashtbl[CLIENT_HASH_SIZE]; static int reclaim_str_hashtbl_size = 0; static struct list_head conf_id_hashtbl[CLIENT_HASH_SIZE]; -static struct list_head conf_str_hashtbl[CLIENT_HASH_SIZE]; -static struct list_head unconf_str_hashtbl[CLIENT_HASH_SIZE]; static struct list_head unconf_id_hashtbl[CLIENT_HASH_SIZE]; +static struct rb_root conf_name_tree; +static struct rb_root unconf_name_tree; static struct list_head client_lru; static struct list_head close_lru; @@ -1144,7 +1146,10 @@ destroy_client(struct nfs4_client *clp) if (clp->cl_cb_conn.cb_xprt) svc_xprt_put(clp->cl_cb_conn.cb_xprt); list_del(&clp->cl_idhash); - list_del(&clp->cl_strhash); + if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags)) + rb_erase(&clp->cl_namenode, &conf_name_tree); + else + rb_erase(&clp->cl_namenode, &unconf_name_tree); spin_lock(&client_lock); unhash_client_locked(clp); if (atomic_read(&clp->cl_refcount) == 0) @@ -1187,6 +1192,17 @@ static int copy_cred(struct svc_cred *target, struct svc_cred *source) return 0; } +static long long +compare_blob(const struct xdr_netobj *o1, const struct xdr_netobj *o2) +{ + long long res; + + res = o1->len - o2->len; + if (res) + return res; + return (long long)memcmp(o1->data, o2->data, o1->len); +} + static int same_name(const char *n1, const char *n2) { return 0 == memcmp(n1, n2, HEXDIR_LEN); @@ -1307,7 +1323,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, atomic_set(&clp->cl_refcount, 0); clp->cl_cb_state = NFSD4_CB_UNKNOWN; INIT_LIST_HEAD(&clp->cl_idhash); - INIT_LIST_HEAD(&clp->cl_strhash); INIT_LIST_HEAD(&clp->cl_openowners); INIT_LIST_HEAD(&clp->cl_delegations); INIT_LIST_HEAD(&clp->cl_lru); @@ -1325,11 +1340,52 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, } static void -add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval) +add_clp_to_name_tree(struct nfs4_client *new_clp, struct rb_root *root) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + struct nfs4_client *clp; + + while (*new) { + clp = rb_entry(*new, struct nfs4_client, cl_namenode); + parent = *new; + + if (compare_blob(&clp->cl_name, &new_clp->cl_name) > 0) + new = &((*new)->rb_left); + else + new = &((*new)->rb_right); + } + + rb_link_node(&new_clp->cl_namenode, parent, new); + rb_insert_color(&new_clp->cl_namenode, root); +} + 
+static struct nfs4_client * +find_clp_in_name_tree(struct xdr_netobj *name, struct rb_root *root) +{ + long long cmp; + struct rb_node *node = root->rb_node; + struct nfs4_client *clp; + + while (node) { + clp = rb_entry(node, struct nfs4_client, cl_namenode); + cmp = compare_blob(&clp->cl_name, name); + if (cmp > 0) + node = node->rb_left; + else if (cmp < 0) + node = node->rb_right; + else + return clp; + } + return NULL; +} + +static void +add_to_unconfirmed(struct nfs4_client *clp) { unsigned int idhashval; - list_add(&clp->cl_strhash, &unconf_str_hashtbl[strhashval]); + clear_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags); + add_clp_to_name_tree(clp, &unconf_name_tree); idhashval = clientid_hashval(clp->cl_clientid.cl_id); list_add(&clp->cl_idhash, &unconf_id_hashtbl[idhashval]); renew_client(clp); @@ -1339,12 +1395,12 @@ static void move_to_confirmed(struct nfs4_client *clp) { unsigned int idhashval = clientid_hashval(clp->cl_clientid.cl_id); - unsigned int strhashval; dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp); list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]); - strhashval = clientstr_hashval(clp->cl_recdir); - list_move(&clp->cl_strhash, &conf_str_hashtbl[strhashval]); + rb_erase(&clp->cl_namenode, &unconf_name_tree); + add_clp_to_name_tree(clp, &conf_name_tree); + set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags); renew_client(clp); } @@ -1387,27 +1443,15 @@ static bool clp_used_exchangeid(struct nfs4_client *clp) } static struct nfs4_client * -find_confirmed_client_by_str(const char *dname, unsigned int hashval) +find_confirmed_client_by_name(struct xdr_netobj *name) { - struct nfs4_client *clp; - - list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) { - if (same_name(clp->cl_recdir, dname)) - return clp; - } - return NULL; + return find_clp_in_name_tree(name, &conf_name_tree); } static struct nfs4_client * -find_unconfirmed_client_by_str(const char *dname, unsigned int hashval) +find_unconfirmed_client_by_name(struct xdr_netobj *name) { - struct nfs4_client *clp; - - list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) { - if (same_name(clp->cl_recdir, dname)) - return clp; - } - return NULL; + return find_clp_in_name_tree(name, &unconf_name_tree); } static void @@ -1572,7 +1616,6 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, { struct nfs4_client *unconf, *conf, *new; __be32 status; - unsigned int strhashval; char dname[HEXDIR_LEN]; char addr_str[INET6_ADDRSTRLEN]; nfs4_verifier verf = exid->verifier; @@ -1605,11 +1648,9 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, if (status) return status; - strhashval = clientstr_hashval(dname); - /* Cases below refer to rfc 5661 section 18.35.4: */ nfs4_lock_state(); - conf = find_confirmed_client_by_str(dname, strhashval); + conf = find_confirmed_client_by_name(&exid->clname); if (conf) { bool creds_match = same_creds(&conf->cl_cred, &rqstp->rq_cred); bool verfs_match = same_verf(&verf, &conf->cl_verifier); @@ -1654,7 +1695,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, goto out; } - unconf = find_unconfirmed_client_by_str(dname, strhashval); + unconf = find_unconfirmed_client_by_name(&exid->clname); if (unconf) /* case 4, possible retry or client restart */ expire_client(unconf); @@ -1668,7 +1709,7 @@ out_new: new->cl_minorversion = 1; gen_clid(new); - add_to_unconfirmed(new, strhashval); + add_to_unconfirmed(new); out_copy: exid->clientid.cl_boot = new->cl_clientid.cl_boot; exid->clientid.cl_id = new->cl_clientid.cl_id; @@ -1789,7 +1830,6 @@ nfsd4_create_session(struct svc_rqst *rqstp, goto 
out_free_conn; } } else if (unconf) { - unsigned int hash; struct nfs4_client *old; if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) { @@ -1803,8 +1843,7 @@ nfsd4_create_session(struct svc_rqst *rqstp, status = nfserr_seq_misordered; goto out_free_conn; } - hash = clientstr_hashval(unconf->cl_recdir); - old = find_confirmed_client_by_str(unconf->cl_recdir, hash); + old = find_confirmed_client_by_name(&unconf->cl_name); if (old) expire_client(old); move_to_confirmed(unconf); @@ -2195,7 +2234,6 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct xdr_netobj clname = setclid->se_name; nfs4_verifier clverifier = setclid->se_verf; - unsigned int strhashval; struct nfs4_client *conf, *unconf, *new; __be32 status; char dname[HEXDIR_LEN]; @@ -2204,11 +2242,9 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) return status; - strhashval = clientstr_hashval(dname); - /* Cases below refer to rfc 3530 section 14.2.33: */ nfs4_lock_state(); - conf = find_confirmed_client_by_str(dname, strhashval); + conf = find_confirmed_client_by_name(&clname); if (conf) { /* case 0: */ status = nfserr_clid_inuse; @@ -2223,7 +2259,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } } - unconf = find_unconfirmed_client_by_str(dname, strhashval); + unconf = find_unconfirmed_client_by_name(&clname); if (unconf) expire_client(unconf); status = nfserr_jukebox; @@ -2237,7 +2273,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, gen_clid(new); new->cl_minorversion = 0; gen_callback(new, setclid, rqstp); - add_to_unconfirmed(new, strhashval); + add_to_unconfirmed(new); setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; setclid->se_clientid.cl_id = new->cl_clientid.cl_id; memcpy(setclid->se_confirm.data, new->cl_confirm.data, sizeof(setclid->se_confirm.data)); @@ -2290,9 +2326,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, nfsd4_probe_callback(conf); expire_client(unconf); } else { /* case 3: normal case; new or rebooted client */ - unsigned int hash = clientstr_hashval(unconf->cl_recdir); - - conf = find_confirmed_client_by_str(unconf->cl_recdir, hash); + conf = find_confirmed_client_by_name(&unconf->cl_name); if (conf) expire_client(conf); move_to_confirmed(unconf); @@ -4706,11 +4740,11 @@ nfs4_state_init(void) for (i = 0; i < CLIENT_HASH_SIZE; i++) { INIT_LIST_HEAD(&conf_id_hashtbl[i]); - INIT_LIST_HEAD(&conf_str_hashtbl[i]); - INIT_LIST_HEAD(&unconf_str_hashtbl[i]); INIT_LIST_HEAD(&unconf_id_hashtbl[i]); INIT_LIST_HEAD(&reclaim_str_hashtbl[i]); } + conf_name_tree = RB_ROOT; + unconf_name_tree = RB_ROOT; for (i = 0; i < SESSION_HASH_SIZE; i++) INIT_LIST_HEAD(&sessionid_hashtbl[i]); for (i = 0; i < FILE_HASH_SIZE; i++) { @@ -4795,6 +4829,7 @@ out_recovery: return ret; } +/* should be called with the state lock held */ static void __nfs4_state_shutdown(void) { @@ -4802,17 +4837,24 @@ __nfs4_state_shutdown(void) struct nfs4_client *clp = NULL; struct nfs4_delegation *dp = NULL; struct list_head *pos, *next, reaplist; + struct rb_node *node, *tmp; for (i = 0; i < CLIENT_HASH_SIZE; i++) { while (!list_empty(&conf_id_hashtbl[i])) { clp = list_entry(conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); destroy_client(clp); } - while (!list_empty(&unconf_str_hashtbl[i])) { - clp = list_entry(unconf_str_hashtbl[i].next, struct nfs4_client, cl_strhash); - destroy_client(clp); - } } + + node = 
rb_first(&unconf_name_tree); + while (node != NULL) { + tmp = node; + node = rb_next(tmp); + clp = rb_entry(tmp, struct nfs4_client, cl_namenode); + rb_erase(tmp, &unconf_name_tree); + destroy_client(clp); + } + INIT_LIST_HEAD(&reaplist); spin_lock(&recall_lock); list_for_each_safe(pos, next, &del_recall_lru) { diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index cf9f7ba4df8d..6c342bd806e5 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -232,7 +232,7 @@ struct nfsd4_sessionid { */ struct nfs4_client { struct list_head cl_idhash; /* hash by cl_clientid.id */ - struct list_head cl_strhash; /* hash by cl_name */ + struct rb_node cl_namenode; /* link into by-name trees */ struct list_head cl_openowners; struct idr cl_stateids; /* stateid lookup */ struct list_head cl_delegations; @@ -253,6 +253,7 @@ struct nfs4_client { #define NFSD4_CLIENT_CB_KILL (1) #define NFSD4_CLIENT_STABLE (2) /* client on stable storage */ #define NFSD4_CLIENT_RECLAIM_COMPLETE (3) /* reclaim_complete done */ +#define NFSD4_CLIENT_CONFIRMED (4) /* client is confirmed */ #define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \ 1 << NFSD4_CLIENT_CB_KILL) unsigned long cl_flags; -- cgit v1.2.1 From 2216d449a97927cc105912e337d169cd4d4db548 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 12 Nov 2012 15:00:57 -0500 Subject: nfsd: get rid of cl_recdir field Remove the cl_recdir field from the nfs4_client struct. Instead, just compute it on the fly when and if it's needed, which is now only when the legacy client tracking code is in effect. The error handling in the legacy client tracker is also changed to handle the case where md5 is unavailable. In that case, we'll warn the admin with a KERN_ERR message and disable the client tracking. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4recover.c | 93 ++++++++++++++++++++++++++++++++++++++++----------- fs/nfsd/nfs4state.c | 18 ++-------- fs/nfsd/state.h | 2 -- 3 files changed, 77 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 3048c012d4bc..80e77cc14250 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -103,33 +103,39 @@ md5_to_hex(char *out, char *md5) *out = '\0'; } -__be32 -nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname) +static int +nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname) { struct xdr_netobj cksum; struct hash_desc desc; struct scatterlist sg; - __be32 status = nfserr_jukebox; + int status; dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n", clname->len, clname->data); desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; desc.tfm = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(desc.tfm)) + if (IS_ERR(desc.tfm)) { + status = PTR_ERR(desc.tfm); goto out_no_tfm; + } + cksum.len = crypto_hash_digestsize(desc.tfm); cksum.data = kmalloc(cksum.len, GFP_KERNEL); - if (cksum.data == NULL) + if (cksum.data == NULL) { + status = -ENOMEM; goto out; + } sg_init_one(&sg, clname->data, clname->len); - if (crypto_hash_digest(&desc, &sg, sg.length, cksum.data)) + status = crypto_hash_digest(&desc, &sg, sg.length, cksum.data); + if (status) goto out; md5_to_hex(dname, cksum.data); - status = nfs_ok; + status = 0; out: kfree(cksum.data); crypto_free_hash(desc.tfm); @@ -137,11 +143,36 @@ out_no_tfm: return status; } +/* + * If we had an error generating the recdir name for the legacy tracker + * then warn the admin. If the error doesn't appear to be transient, + * then disable recovery tracking. 
+ */ +static void +legacy_recdir_name_error(int error) +{ + printk(KERN_ERR "NFSD: unable to generate recoverydir " + "name (%d).\n", error); + + /* + * if the algorithm just doesn't exist, then disable the recovery + * tracker altogether. The crypto libs will generally return this if + * FIPS is enabled as well. + */ + if (error == -ENOENT) { + printk(KERN_ERR "NFSD: disabling legacy clientid tracking. " + "Reboot recovery will not function correctly!\n"); + + /* the argument is ignored by the legacy exit function */ + nfsd4_client_tracking_exit(NULL); + } +} + static void nfsd4_create_clid_dir(struct nfs4_client *clp) { const struct cred *original_cred; - char *dname = clp->cl_recdir; + char dname[HEXDIR_LEN]; struct dentry *dir, *dentry; struct nfs4_client_reclaim *crp; int status; @@ -152,6 +183,11 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) return; if (!rec_file) return; + + status = nfs4_make_rec_clidname(dname, &clp->cl_name); + if (status) + return legacy_recdir_name_error(status); + status = nfs4_save_creds(&original_cred); if (status < 0) return; @@ -186,7 +222,7 @@ out_unlock: mutex_unlock(&dir->d_inode->i_mutex); if (status == 0) { if (in_grace) { - crp = nfs4_client_to_reclaim(clp->cl_recdir); + crp = nfs4_client_to_reclaim(dname); if (crp) crp->cr_clp = clp; } @@ -298,11 +334,16 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) { const struct cred *original_cred; struct nfs4_client_reclaim *crp; + char dname[HEXDIR_LEN]; int status; if (!rec_file || !test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; + status = nfs4_make_rec_clidname(dname, &clp->cl_name); + if (status) + return legacy_recdir_name_error(status); + status = mnt_want_write_file(rec_file); if (status) goto out; @@ -312,13 +353,13 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) if (status < 0) goto out_drop_write; - status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1); + status = nfsd4_unlink_clid_dir(dname, HEXDIR_LEN-1); nfs4_reset_creds(original_cred); if (status == 0) { vfs_fsync(rec_file, 0); if (in_grace) { /* remove reclaim record */ - crp = nfsd4_find_reclaim_client(clp->cl_recdir); + crp = nfsd4_find_reclaim_client(dname); if (crp) nfs4_remove_reclaim_record(crp); } @@ -328,7 +369,7 @@ out_drop_write: out: if (status) printk("NFSD: Failed to remove expired client state directory" - " %.*s\n", HEXDIR_LEN, clp->cl_recdir); + " %.*s\n", HEXDIR_LEN, dname); } static int @@ -500,14 +541,22 @@ nfs4_recoverydir(void) static int nfsd4_check_legacy_client(struct nfs4_client *clp) { + int status; + char dname[HEXDIR_LEN]; struct nfs4_client_reclaim *crp; /* did we already find that this client is stable? 
*/ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return 0; + status = nfs4_make_rec_clidname(dname, &clp->cl_name); + if (status) { + legacy_recdir_name_error(status); + return status; + } + /* look for it in the reclaim hashtable otherwise */ - crp = nfsd4_find_reclaim_client(clp->cl_recdir); + crp = nfsd4_find_reclaim_client(dname); if (crp) { set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); crp->cr_clp = clp; @@ -993,7 +1042,7 @@ nfsd4_cltrack_legacy_topdir(void) } static char * -nfsd4_cltrack_legacy_recdir(const char *recdir) +nfsd4_cltrack_legacy_recdir(const struct xdr_netobj *name) { int copied; size_t len; @@ -1010,10 +1059,16 @@ nfsd4_cltrack_legacy_recdir(const char *recdir) if (!result) return result; - copied = snprintf(result, len, LEGACY_RECDIR_ENV_PREFIX "%s/%s", - nfs4_recoverydir(), recdir); - if (copied >= len) { - /* just return nothing if output was truncated */ + copied = snprintf(result, len, LEGACY_RECDIR_ENV_PREFIX "%s/", + nfs4_recoverydir()); + if (copied > (len - HEXDIR_LEN)) { + /* just return nothing if output will be truncated */ + kfree(result); + return NULL; + } + + copied = nfs4_make_rec_clidname(result + copied, name); + if (copied) { kfree(result); return NULL; } @@ -1126,7 +1181,7 @@ nfsd4_umh_cltrack_check(struct nfs4_client *clp) dprintk("%s: can't allocate memory for upcall!\n", __func__); return -ENOMEM; } - legacy = nfsd4_cltrack_legacy_recdir(clp->cl_recdir); + legacy = nfsd4_cltrack_legacy_recdir(&clp->cl_name); ret = nfsd4_umh_cltrack_upcall("check", hexid, legacy); kfree(legacy); kfree(hexid); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 99998a1eb426..37b19f7948ed 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1299,7 +1299,7 @@ static struct nfs4_stid *find_stateid_by_type(struct nfs4_client *cl, stateid_t return NULL; } -static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, +static struct nfs4_client *create_client(struct xdr_netobj name, struct svc_rqst *rqstp, nfs4_verifier *verf) { struct nfs4_client *clp; @@ -1319,7 +1319,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, return NULL; } idr_init(&clp->cl_stateids); - memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); atomic_set(&clp->cl_refcount, 0); clp->cl_cb_state = NFSD4_CB_UNKNOWN; INIT_LIST_HEAD(&clp->cl_idhash); @@ -1616,7 +1615,6 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, { struct nfs4_client *unconf, *conf, *new; __be32 status; - char dname[HEXDIR_LEN]; char addr_str[INET6_ADDRSTRLEN]; nfs4_verifier verf = exid->verifier; struct sockaddr *sa = svc_addr(rqstp); @@ -1643,11 +1641,6 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, return nfserr_serverfault; /* no excuse :-/ */ } - status = nfs4_make_rec_clidname(dname, &exid->clname); - - if (status) - return status; - /* Cases below refer to rfc 5661 section 18.35.4: */ nfs4_lock_state(); conf = find_confirmed_client_by_name(&exid->clname); @@ -1701,7 +1694,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, /* case 1 (normal case) */ out_new: - new = create_client(exid->clname, dname, rqstp, &verf); + new = create_client(exid->clname, rqstp, &verf); if (new == NULL) { status = nfserr_jukebox; goto out; @@ -2236,12 +2229,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_verifier clverifier = setclid->se_verf; struct nfs4_client *conf, *unconf, *new; __be32 status; - char dname[HEXDIR_LEN]; - status = nfs4_make_rec_clidname(dname, &clname); - if (status) - return status; - /* Cases below refer to rfc 3530 section 
14.2.33: */ nfs4_lock_state(); conf = find_confirmed_client_by_name(&clname); @@ -2263,7 +2251,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (unconf) expire_client(unconf); status = nfserr_jukebox; - new = create_client(clname, dname, rqstp, &clverifier); + new = create_client(clname, rqstp, &clverifier); if (new == NULL) goto out; if (conf && same_verf(&conf->cl_verifier, &clverifier)) diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 6c342bd806e5..029217ad2cb0 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -238,7 +238,6 @@ struct nfs4_client { struct list_head cl_delegations; struct list_head cl_lru; /* tail queue */ struct xdr_netobj cl_name; /* id generated by client */ - char cl_recdir[HEXDIR_LEN]; /* recovery dir */ nfs4_verifier cl_verifier; /* generated by client */ time_t cl_time; /* time of last lease renewal */ struct sockaddr_storage cl_addr; /* client ipaddress */ @@ -482,7 +481,6 @@ extern int nfsd4_create_callback_queue(void); extern void nfsd4_destroy_callback_queue(void); extern void nfsd4_shutdown_callback(struct nfs4_client *); extern void nfs4_put_delegation(struct nfs4_delegation *dp); -extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name); extern bool nfs4_has_reclaimed_state(const char *name); extern void release_session_client(struct nfsd4_session *); -- cgit v1.2.1 From 7e4f015d815d04d888d434889dd3bbb4c210511a Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 12 Nov 2012 15:00:58 -0500 Subject: nfsd: release the legacy reclaimable clients list in grace_done The current code holds on to this list until nfsd is shut down, but it's never touched once the grace period ends. Release that memory back into the wild when the grace period ends. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4recover.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 80e77cc14250..b03b6aa7a6a0 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -404,6 +404,7 @@ nfsd4_recdir_purge_old(struct net *net, time_t boot_time) vfs_fsync(rec_file, 0); mnt_drop_write_file(rec_file); out: + nfs4_release_reclaim(); if (status) printk("nfsd4: failed to purge old clients from recovery" " directory %s\n", rec_file->f_path.dentry->d_name.name); -- cgit v1.2.1 From c6af8803cd4f56aa62a47448c55030d4905b6783 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 12 Nov 2012 23:51:02 -0500 Subject: ext4: don't verify checksums of dx non-leaf nodes during fallback scan During a directory entry lookup of a hashed directory, if the hash-based lookup functions fail and we fall back to a linear scan, don't try to verify the dirent checksum on the internal nodes of the hash tree because they don't store a checksum in a hidden dirent like the leaf nodes do. Reported-by: George Spelvin Signed-off-by: Darrick J. 
Wong Signed-off-by: "Theodore Ts'o" --- fs/ext4/namei.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 580af3dfc0eb..88e9a2c7e328 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1146,6 +1146,21 @@ static inline int search_dirblock(struct buffer_head *bh, return 0; } +static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, + struct ext4_dir_entry *de) +{ + struct super_block *sb = dir->i_sb; + + if (!is_dx(dir)) + return 0; + if (block == 0) + return 1; + if (de->inode == 0 && + ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) == + sb->s_blocksize) + return 1; + return 0; +} /* * ext4_find_entry() @@ -1246,6 +1261,8 @@ restart: goto next; } if (!buffer_verified(bh) && + !is_dx_internal_node(dir, block, + (struct ext4_dir_entry *)bh->b_data) && !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) { EXT4_ERROR_INODE(dir, "checksumming directory " -- cgit v1.2.1 From fa731fc4e045a801814547188a63c2cd49a4cfe6 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 13 Nov 2012 09:50:28 +0000 Subject: GFS2: Fix truncation of journaled data files This patch fixes an issue relating to not having enough revokes available when truncating journaled data files. In order to ensure that we do not run out, the truncation is broken into separate pieces if it is large enough. Tested using fsx on a journaled data file. Signed-off-by: Steven Whitehouse --- fs/gfs2/bmap.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index de70e52caf3a..a68e91bcef3d 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -991,6 +991,41 @@ unlock: return err; } +/** + * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files + * @inode: The inode being truncated + * @oldsize: The original (larger) size + * @newsize: The new smaller size + * + * With jdata files, we have to journal a revoke for each block which is + * truncated. As a result, we need to split this into separate transactions + * if the number of pages being truncated gets too large. + */ + +#define GFS2_JTRUNC_REVOKES 8192 + +static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize) +{ + struct gfs2_sbd *sdp = GFS2_SB(inode); + u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize; + u64 chunk; + int error; + + while (oldsize != newsize) { + chunk = oldsize - newsize; + if (chunk > max_chunk) + chunk = max_chunk; + truncate_pagecache(inode, oldsize, oldsize - chunk); + oldsize -= chunk; + gfs2_trans_end(sdp); + error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES); + if (error) + return error; + } + + return 0; +} + static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) { struct gfs2_inode *ip = GFS2_I(inode); @@ -1000,8 +1035,10 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) int journaled = gfs2_is_jdata(ip); int error; - error = gfs2_trans_begin(sdp, - RES_DINODE + (journaled ?
RES_JDATA : 0), 0); + if (journaled) + error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES); + else + error = gfs2_trans_begin(sdp, RES_DINODE, 0); if (error) return error; @@ -1026,7 +1063,16 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_dinode_out(ip, dibh->b_data); - truncate_pagecache(inode, oldsize, newsize); + if (journaled) + error = gfs2_journaled_truncate(inode, oldsize, newsize); + else + truncate_pagecache(inode, oldsize, newsize); + + if (error) { + brelse(dibh); + return error; + } + out_brelse: brelse(dibh); out: -- cgit v1.2.1 From 343cd8f0d78515da38e41e9351f5ba306cdec84a Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Mon, 12 Nov 2012 13:04:54 -0500 Subject: GFS2: Use dirty_inode in gfs2_dir_add This patch changes the gfs2_dir_add function so that it uses the dirty_inode function (via mark_inode_dirty) rather than manually updating the dinode. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/dir.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 259b088cfc4c..9a35670fdc38 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1676,16 +1676,11 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, be16_add_cpu(&leaf->lf_entries, 1); } brelse(bh); - error = gfs2_meta_inode_buffer(ip, &bh); - if (error) - break; - gfs2_trans_add_bh(ip->i_gl, bh, 1); ip->i_entries++; ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; if (S_ISDIR(nip->i_inode.i_mode)) inc_nlink(&ip->i_inode); - gfs2_dinode_out(ip, bh->b_data); - brelse(bh); + mark_inode_dirty(inode); error = 0; break; } -- cgit v1.2.1 From 4327a9bf71f4b021b675e01f24fefc647cff7513 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Mon, 12 Nov 2012 13:03:29 -0500 Subject: GFS2: Eliminate redundant buffer_head manipulation in gfs2_unlink_inode Since we now have a dirty_inode that takes care of manipulating the inode buffer and writing from the inode to the buffer, we can eliminate some unnecessary buffer manipulations in gfs2_unlink_inode that are now redundant. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index ef3ce00bb528..e321333f0b4c 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -995,7 +995,6 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, * gfs2_unlink_inode - Removes an inode from its parent dir and unlinks it * @dip: The parent directory * @name: The name of the entry in the parent directory - * @bh: The inode buffer for the inode to be removed * @inode: The inode to be removed * * Called with all the locks and in a transaction. 
This will only be @@ -1005,8 +1004,7 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, */ static int gfs2_unlink_inode(struct gfs2_inode *dip, - const struct dentry *dentry, - struct buffer_head *bh) + const struct dentry *dentry) { struct inode *inode = dentry->d_inode; struct gfs2_inode *ip = GFS2_I(inode); @@ -1046,7 +1044,6 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) struct gfs2_sbd *sdp = GFS2_SB(dir); struct inode *inode = dentry->d_inode; struct gfs2_inode *ip = GFS2_I(inode); - struct buffer_head *bh; struct gfs2_holder ghs[3]; struct gfs2_rgrpd *rgd; int error; @@ -1094,15 +1091,10 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) goto out_gunlock; error = gfs2_trans_begin(sdp, 2*RES_DINODE + 3*RES_LEAF + RES_RG_BIT, 0); - if (error) - goto out_gunlock; - - error = gfs2_meta_inode_buffer(ip, &bh); if (error) goto out_end_trans; - error = gfs2_unlink_inode(dip, dentry, bh); - brelse(bh); + error = gfs2_unlink_inode(dip, dentry); out_end_trans: gfs2_trans_end(sdp); @@ -1402,14 +1394,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, /* Remove the target file, if it exists */ - if (nip) { - struct buffer_head *bh; - error = gfs2_meta_inode_buffer(nip, &bh); - if (error) - goto out_end_trans; - error = gfs2_unlink_inode(ndip, ndentry, bh); - brelse(bh); - } + if (nip) + error = gfs2_unlink_inode(ndip, ndentry); if (dir_rename) { error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR); -- cgit v1.2.1 From aa8920c96897dd82f0520f9e7db7311b42547ce6 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 13 Nov 2012 14:50:35 +0000 Subject: GFS2: Fix one RG corner case For filesystems with only a single resource group, we need to be careful that the allocation loop will not land up with a NULL resource group. This fixes a bug in a previous patch where the gfs2_rgrpd_get_next() function was being used instead of gfs2_rgrpd_get_first(). Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 99a619788c65..5625e93bf61f 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1776,10 +1776,11 @@ static u32 gfs2_orlov_skip(const struct gfs2_inode *ip) static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin) { struct gfs2_rgrpd *rgd = *pos; + struct gfs2_sbd *sdp = rgd->rd_sbd; rgd = gfs2_rgrpd_get_next(rgd); if (rgd == NULL) - rgd = gfs2_rgrpd_get_next(NULL); + rgd = gfs2_rgrpd_get_first(sdp); *pos = rgd; if (rgd != begin) /* If we didn't wrap */ return true; -- cgit v1.2.1 From 07428d7f0ca46087f7f1efa895322bb9dc1ac21d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:09:44 +1100 Subject: xfs: fix attr tree double split corruption In certain circumstances, a double split of an attribute tree is needed to insert or replace an attribute. In rare situations, this can go wrong, leaving the attribute tree corrupted. In this case, the attr being replaced is the last attr in a leaf node, and the replacement is larger so doesn't fit in the same leaf node. We start from the initial condition of a node format attribute btree with two leaves at index 1 and 2; call them L1 and L2. The leaf L1 is completely full; there is not a single byte of free space in it. L2 is mostly empty. The attribute being replaced - call it X - is the last attribute in L1.
The way an attribute replace is executed is that the replacement attribute - call it Y - is first inserted into the tree, but has an INCOMPLETE flag set on it so that list traversals ignore it. Once this transaction is committed, a second transaction is run to atomically mark Y as COMPLETE and X as INCOMPLETE, so that a traversal will now find Y and skip X. Once that transaction is committed, attribute X is then removed.

So, the initial condition is:

    +--------+     +--------+
    |   L1   |     |   L2   |
    | fwd: 2 |---->| fwd: 0 |
    | bwd: 0 |<----| bwd: 1 |
    | fsp: 0 |     | fsp: N |
    |--------|     |--------|
    | attr A |     | attr 1 |
    |--------|     |--------|
    | attr B |     | attr 2 |
    |--------|     |--------|
    ..........     ..........
    |--------|     |--------|
    | attr X |     | attr n |
    +--------+     +--------+

So now we go to replace X, and see that L1:fsp = 0 - it is full so we can't insert Y in the same leaf. So we record the location of attribute X so we can track it for later use, then we split L1 into L1 and L3 and rebalance across the two leaves. We end with:

    +--------+     +--------+     +--------+
    |   L1   |     |   L3   |     |   L2   |
    | fwd: 3 |---->| fwd: 2 |---->| fwd: 0 |
    | bwd: 0 |<----| bwd: 1 |<----| bwd: 3 |
    | fsp: M |     | fsp: J |     | fsp: N |
    |--------|     |--------|     |--------|
    | attr A |     | attr X |     | attr 1 |
    |--------|     +--------+     |--------|
    | attr B |                    | attr 2 |
    |--------|                    |--------|
    ..........                    ..........
    |--------|                    |--------|
    | attr W |                    | attr n |
    +--------+                    +--------+

And we track that the original attribute is now at L3:0. We then try to insert Y into L1 again, and find that there isn't enough room because the new attribute is larger than the old one. Hence we have to split again to make room for Y. We end up with this:

    +--------+     +--------+     +--------+     +--------+
    |   L1   |     |   L4   |     |   L3   |     |   L2   |
    | fwd: 4 |---->| fwd: 3 |---->| fwd: 2 |---->| fwd: 0 |
    | bwd: 0 |<----| bwd: 1 |<----| bwd: 4 |<----| bwd: 3 |
    | fsp: M |     | fsp: J |     | fsp: J |     | fsp: N |
    |--------|     |--------|     |--------|     |--------|
    | attr A |     | attr Y |     | attr X |     | attr 1 |
    |--------|     + INCOMP +     +--------+     |--------|
    | attr B |     +--------+                    | attr 2 |
    |--------|                                   |--------|
    ..........                                   ..........
    |--------|                                   |--------|
    | attr W |                                   | attr n |
    +--------+                                   +--------+

And now we have the new (incomplete) attribute @ L4:0, and the original attribute at L3:0. At this point, the first transaction is committed, and we move to the flipping of the flags. We are supposed to end up with this:

    +--------+     +--------+     +--------+     +--------+
    |   L1   |     |   L4   |     |   L3   |     |   L2   |
    | fwd: 4 |---->| fwd: 3 |---->| fwd: 2 |---->| fwd: 0 |
    | bwd: 0 |<----| bwd: 1 |<----| bwd: 4 |<----| bwd: 3 |
    | fsp: M |     | fsp: J |     | fsp: J |     | fsp: N |
    |--------|     |--------|     |--------|     |--------|
    | attr A |     | attr Y |     | attr X |     | attr 1 |
    |--------|     +--------+     + INCOMP +     |--------|
    | attr B |                    +--------+     | attr 2 |
    |--------|                                   |--------|
    ..........                                   ..........
    |--------|                                   |--------|
    | attr W |                                   | attr n |
    +--------+                                   +--------+

But that doesn't happen properly - the attribute tracking indexes are not pointing to the right locations. What we end up with is both the old attribute to be removed pointing at L4:0 and the new attribute at L4:1. On a debug kernel, this assert fails like so:

    XFS: Assertion failed: args->index2 < be16_to_cpu(leaf2->hdr.count), file: fs/xfs/xfs_attr_leaf.c, line: 2725

because the new attribute location does not exist. On a production kernel, this goes unnoticed and the code proceeds ahead merrily and removes L4 because it thinks that is the block that is no longer needed.
This leaves the hash index node pointing to entries L1, L4 and L2, but only blocks L1, L3 and L2 exist. Further, the leaf level sibling list is L1 <-> L4 <-> L2, but L4 is now free space, and so everything is busted. This corruption is caused by the removal of the old attribute triggering a join - it joins everything correctly but then frees the wrong block.

xfs_repair will report something like:

    bad sibling back pointer for block 4 in attribute fork for inode 131
    problem with attribute contents in inode 131
    would clear attr fork
    bad nblocks 8 for inode 131, would reset to 3
    bad anextents 4 for inode 131, would reset to 0

The problem lies in the assignment of the old/new blocks for tracking purposes when the double leaf split occurs. The first split tries to place the new attribute inside the current leaf (i.e. "inleaf == true") and moves the old attribute (X) to the new block. This sets up the old block/index to L1:X, and the newly allocated block to L3:0. It then moves attr X to the new block and tries to insert attr Y at the old index. That fails, so it splits again.

With the second split, the rebalance ends up placing the new attr in the second new block - L4:0 - and this is where the code goes wrong. What it does is set both the new and old block index to the second new block. Hence it inserts attr Y at the right place (L4:0) but overwrites the current location of the attr to replace that is held in the new block index (currently L3:0). It overwrites it with L4:1 - the index that the assert later fails on.

Hopefully this table will show this in a format that is a bit easier to understand:

    Split        old attr index       new attr index
                 vanilla  patched     vanilla  patched
    before 1st   L1:26    L1:26       N/A      N/A
    after 1st    L3:0     L3:0        L1:26    L1:26
    after 2nd    L4:0     L3:0        L4:1     L4:0
                 ^^^^                 ^^^^
                 wrong                wrong

The fix is surprisingly simple, for all this analysis - just stop the rebalance on the out-of-leaf case from overwriting the new attr index - it's already correct for the double split case.

Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_attr_leaf.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index d330111ca738..70eec1829776 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -1291,6 +1291,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, leaf2 = blk2->bp->b_addr; ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + ASSERT(leaf2->hdr.count == 0); args = state->args; trace_xfs_attr_leaf_rebalance(args); @@ -1361,6 +1362,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * I assert that since all callers pass in an empty * second buffer, this code should never execute. */ + ASSERT(0); /* * Figure the total bytes to be added to the destination leaf. @@ -1422,10 +1424,24 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, args->index2 = 0; args->blkno2 = blk2->blkno; } else { + /* + * On a double leaf split, the original attr location + * is already stored in blkno2/index2, so don't + * overwrite it otherwise we corrupt the tree.
+ */ blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count); - args->index = args->index2 = blk2->index; - args->blkno = args->blkno2 = blk2->blkno; + args->index = blk2->index; + args->blkno = blk2->blkno; + if (!state->extravalid) { + /* + * set the new attr location to match the old + * one and let the higher level split code + * decide where in the leaf to place it. + */ + args->index2 = blk2->index; + args->blkno2 = blk2->blkno; + } } } else { ASSERT(state->inleaf == 1); -- cgit v1.2.1 From 7bf7f352194252e6f05981d44fb8cb55668606cd Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:09:45 +1100 Subject: xfs: fix broken error handling in xfs_vm_writepage When we shut down the filesystem, it might first be detected in writeback when we are allocating an inode size transaction. This happens after we have moved all the pages into the writeback state and unlocked them. Unfortunately, if we fail to set up the transaction we then abort writeback and try to invalidate the current page. This then triggers a BUG() in block_invalidatepage() because we are trying to invalidate an unlocked page. Fixing this is a bit of a chicken and egg problem - we can't allocate the transaction until we've clustered all the pages into the IO and we know the size of it (i.e. whether the last block of the IO is beyond the current EOF or not). However, we don't want to hold pages locked for long periods of time, especially while we lock other pages to cluster them into the write. To fix this, we need to make a clear delineation in writeback where errors can only be handled by IO completion processing. That is, once we have marked a page for writeback and unlocked it, we have to report errors via IO completion because we've already started the IO. We may not have submitted any IO, but we've changed the page state to indicate that it is under IO so we must now use the IO completion path to report errors. To do this, add an error field to xfs_submit_ioend() to pass it the error that occurred during the building of the ioend chain. When this is non-zero, mark each ioend with the error and call xfs_finish_ioend() directly rather than building bios. This will immediately push the ioends through completion processing with the error that has occurred. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_aops.c | 54 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e562dd43f41f..e57e2daa357c 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -481,11 +481,17 @@ static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh) * * The fix is two passes across the ioend list - one to start writeback on the * buffer_heads, and then submit them for I/O on the second pass. + * + * If @fail is non-zero, it means that we have a situation where some part of + * the submission process has failed after we have marked pages for writeback + * and unlocked them. In this situation, we need to fail the ioend chain rather + * than submit it to IO. This typically only happens on a filesystem shutdown. */ STATIC void xfs_submit_ioend( struct writeback_control *wbc, - xfs_ioend_t *ioend) + xfs_ioend_t *ioend, + int fail) { xfs_ioend_t *head = ioend; xfs_ioend_t *next; @@ -506,6 +512,18 @@ xfs_submit_ioend( next = ioend->io_list; bio = NULL; + /* + * If we are failing the IO now, just mark the ioend with an + * error and finish it.
This will run IO completion immediately + * as there is only one reference to the ioend at this point in + * time. + */ + if (fail) { + ioend->io_error = -fail; + xfs_finish_ioend(ioend); + continue; + } + for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { if (!bio) { @@ -1060,7 +1078,18 @@ xfs_vm_writepage( xfs_start_page_writeback(page, 1, count); - if (ioend && imap_valid) { + /* if there is no IO to be submitted for this page, we are done */ + if (!ioend) + return 0; + + ASSERT(iohead); + + /* + * Any errors from this point onwards need to be reported through the IO + * completion path as we have marked the initial page as under writeback + * and unlocked it. + */ + if (imap_valid) { xfs_off_t end_index; end_index = imap.br_startoff + imap.br_blockcount; @@ -1079,20 +1108,15 @@ xfs_vm_writepage( wbc, end_index); } - if (iohead) { - /* - * Reserve log space if we might write beyond the on-disk - * inode size. - */ - if (ioend->io_type != XFS_IO_UNWRITTEN && - xfs_ioend_is_append(ioend)) { - err = xfs_setfilesize_trans_alloc(ioend); - if (err) - goto error; - } - xfs_submit_ioend(wbc, iohead); - } + /* + * Reserve log space if we might write beyond the on-disk inode size. + */ + err = 0; + if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend)) + err = xfs_setfilesize_trans_alloc(ioend); + + xfs_submit_ioend(wbc, iohead, err); return 0; -- cgit v1.2.1 From 37eb17e604ac7398bbb133c82f281475d704fff7 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:09:46 +1100 Subject: xfs: drop buffer io reference when a bad bio is built Error handling in xfs_buf_ioapply_map() does not handle IO reference counts correctly. We increment the b_io_remaining count before building the bio, but then fail to decrement it in the failure case. This leads to the buffer never running IO completion and releasing the reference that the IO holds, so at unmount we can leak the buffer. This leak is captured by this assert failure during unmount: XFS: Assertion failed: atomic_read(&pag->pag_ref) == 0, file: fs/xfs/xfs_mount.c, line: 273 This is not a new bug - the b_io_remaining accounting has had this problem for a long, long time - it's just very hard to get a zero length bio being built by this code... Further, the buffer IO error can be overwritten on a multi-segment buffer by subsequent bio completions for partial sections of the buffer. Hence we should only set the buffer error status if the buffer is not already carrying an error status. This ensures that a partial IO error on a multi-segment buffer will not be lost. This part of the problem is a regression, however. cc: Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 933b7930b863..4b0b8dd1b7b0 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1197,9 +1197,14 @@ xfs_buf_bio_end_io( { xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; - xfs_buf_ioerror(bp, -error); + /* + * don't overwrite existing errors - otherwise we can lose errors on + * buffers that require multiple bios to complete.
+ */ + if (!bp->b_error) + xfs_buf_ioerror(bp, -error); - if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) + if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); _xfs_buf_ioend(bp, 1); @@ -1279,6 +1284,11 @@ next_chunk: if (size) goto next_chunk; } else { + /* + * This is guaranteed not to be the last io reference count + * because the caller (xfs_buf_iorequest) holds a count itself. + */ + atomic_dec(&bp->b_io_remaining); xfs_buf_ioerror(bp, EIO); bio_put(bio); } -- cgit v1.2.1 From ee73259b401317117e7f5d4834c270b10b12bc8e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:53:53 +1100 Subject: xfs: add more attribute tree trace points. Added when debugging recent attribute tree problems to more finely trace code execution through the maze of twisty passages that makes up the attr code. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_attr.c | 18 +++++++++++++++++ fs/xfs/xfs_attr_leaf.c | 37 ++++++++++++++++++++-------------- fs/xfs/xfs_da_btree.c | 6 ++++++ fs/xfs/xfs_trace.h | 54 +++++++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 99 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 0ca1f0be62d2..55bbe98e8f82 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -1155,6 +1155,8 @@ xfs_attr_leaf_get(xfs_da_args_t *args) struct xfs_buf *bp; int error; + trace_xfs_attr_leaf_get(args); + args->blkno = 0; error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, XFS_ATTR_FORK); @@ -1185,6 +1187,8 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context) int error; struct xfs_buf *bp; + trace_xfs_attr_leaf_list(context); + context->cursor->blkno = 0; error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK); if (error) @@ -1653,6 +1657,8 @@ xfs_attr_fillstate(xfs_da_state_t *state) xfs_da_state_blk_t *blk; int level; + trace_xfs_attr_fillstate(state->args); + /* * Roll down the "path" in the state structure, storing the on-disk * block number for those buffers in the "path". @@ -1699,6 +1705,8 @@ xfs_attr_refillstate(xfs_da_state_t *state) xfs_da_state_blk_t *blk; int level, error; + trace_xfs_attr_refillstate(state->args); + /* * Roll down the "path" in the state structure, storing the on-disk * block number for those buffers in the "path". 
@@ -1755,6 +1763,8 @@ xfs_attr_node_get(xfs_da_args_t *args) int error, retval; int i; + trace_xfs_attr_node_get(args); + state = xfs_da_state_alloc(); state->args = args; state->mp = args->dp->i_mount; @@ -1804,6 +1814,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) int error, i; struct xfs_buf *bp; + trace_xfs_attr_node_list(context); + cursor = context->cursor; cursor->initted = 1; @@ -1959,6 +1971,8 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) int nmap, error, tmp, valuelen, blkcnt, i; xfs_dablk_t lblkno; + trace_xfs_attr_rmtval_get(args); + ASSERT(!(args->flags & ATTR_KERNOVAL)); mp = args->dp->i_mount; @@ -2014,6 +2028,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) xfs_dablk_t lblkno; int blkcnt, valuelen, nmap, error, tmp, committed; + trace_xfs_attr_rmtval_set(args); + dp = args->dp; mp = dp->i_mount; src = args->value; @@ -2143,6 +2159,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) xfs_dablk_t lblkno; int valuelen, blkcnt, nmap, error, done, committed; + trace_xfs_attr_rmtval_remove(args); + mp = args->dp->i_mount; /* diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 70eec1829776..4bfc732bc9c9 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -57,7 +57,8 @@ STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t which_block, struct xfs_buf **bpp); STATIC int xfs_attr_leaf_add_work(struct xfs_buf *leaf_buffer, xfs_da_args_t *args, int freemap_index); -STATIC void xfs_attr_leaf_compact(xfs_trans_t *tp, struct xfs_buf *leaf_buffer); +STATIC void xfs_attr_leaf_compact(struct xfs_da_args *args, + struct xfs_buf *leaf_buffer); STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_da_state_blk_t *blk2); @@ -1071,7 +1072,7 @@ xfs_attr_leaf_add( * Compact the entries to coalesce free space. * This may change the hdr->count via dropping INCOMPLETE entries. */ - xfs_attr_leaf_compact(args->trans, bp); + xfs_attr_leaf_compact(args, bp); /* * After compaction, the block is guaranteed to have only one @@ -1102,6 +1103,8 @@ xfs_attr_leaf_add_work( xfs_mount_t *mp; int tmp, i; + trace_xfs_attr_leaf_add_work(args); + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); hdr = &leaf->hdr; @@ -1214,15 +1217,17 @@ xfs_attr_leaf_add_work( */ STATIC void xfs_attr_leaf_compact( - struct xfs_trans *trans, - struct xfs_buf *bp) + struct xfs_da_args *args, + struct xfs_buf *bp) { - xfs_attr_leafblock_t *leaf_s, *leaf_d; - xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; - xfs_mount_t *mp; - char *tmpbuffer; + xfs_attr_leafblock_t *leaf_s, *leaf_d; + xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; + struct xfs_trans *trans = args->trans; + struct xfs_mount *mp = trans->t_mountp; + char *tmpbuffer; + + trace_xfs_attr_leaf_compact(args); - mp = trans->t_mountp; tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP); ASSERT(tmpbuffer != NULL); memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp)); @@ -1345,9 +1350,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, max = be16_to_cpu(hdr2->firstused) - sizeof(xfs_attr_leaf_hdr_t); max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t); - if (space > max) { - xfs_attr_leaf_compact(args->trans, blk2->bp); - } + if (space > max) + xfs_attr_leaf_compact(args, blk2->bp); /* * Move high entries from leaf1 to low end of leaf2. 
@@ -1378,9 +1382,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, max = be16_to_cpu(hdr1->firstused) - sizeof(xfs_attr_leaf_hdr_t); max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t); - if (space > max) { - xfs_attr_leaf_compact(args->trans, blk1->bp); - } + if (space > max) + xfs_attr_leaf_compact(args, blk1->bp); /* * Move low entries from leaf2 to high end of leaf1. @@ -1577,6 +1580,8 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) xfs_dablk_t blkno; struct xfs_buf *bp; + trace_xfs_attr_leaf_toosmall(state->args); + /* * Check for the degenerate case of the block being over 50% full. * If so, it's not worth even looking to see if we might be able @@ -1702,6 +1707,8 @@ xfs_attr_leaf_remove( int tablesize, tmp, i; xfs_mount_t *mp; + trace_xfs_attr_leaf_remove(args); + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); hdr = &leaf->hdr; diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 7bfb7dd334fc..c62e7e6ff50e 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -779,6 +779,8 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) xfs_dablk_t blkno; struct xfs_buf *bp; + trace_xfs_da_node_toosmall(state->args); + /* * Check for the degenerate case of the block being over 50% full. * If so, it's not worth even looking to see if we might be able @@ -900,6 +902,8 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path) xfs_dahash_t lasthash=0; int level, count; + trace_xfs_da_fixhashpath(state->args); + level = path->active-1; blk = &path->blk[ level ]; switch (blk->magic) { @@ -1417,6 +1421,8 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, xfs_dablk_t blkno=0; int level, error; + trace_xfs_da_path_shift(state->args); + /* * Roll up the Btree looking for the first block where our * current index is not at the edge of the block. Note that diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index cb5234632072..2e137d4a85ae 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -96,6 +96,8 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_full); DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add); DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk); DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound); +DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list); +DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list); DECLARE_EVENT_CLASS(xfs_perag_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, @@ -1502,8 +1504,42 @@ DEFINE_DIR2_EVENT(xfs_dir2_node_replace); DEFINE_DIR2_EVENT(xfs_dir2_node_removename); DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf); +DECLARE_EVENT_CLASS(xfs_attr_class, + TP_PROTO(struct xfs_da_args *args), + TP_ARGS(args), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __dynamic_array(char, name, args->namelen) + __field(int, namelen) + __field(int, valuelen) + __field(xfs_dahash_t, hashval) + __field(int, op_flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(args->dp)->i_sb->s_dev; + __entry->ino = args->dp->i_ino; + if (args->namelen) + memcpy(__get_str(name), args->name, args->namelen); + __entry->namelen = args->namelen; + __entry->valuelen = args->valuelen; + __entry->hashval = args->hashval; + __entry->op_flags = args->op_flags; + ), + TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d " + "hashval 0x%x op_flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->namelen, + __entry->namelen ? 
__get_str(name) : NULL, + __entry->namelen, + __entry->valuelen, + __entry->hashval, + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) +) + #define DEFINE_ATTR_EVENT(name) \ -DEFINE_EVENT(xfs_da_class, name, \ +DEFINE_EVENT(xfs_attr_class, name, \ TP_PROTO(struct xfs_da_args *args), \ TP_ARGS(args)) DEFINE_ATTR_EVENT(xfs_attr_sf_add); @@ -1517,10 +1553,14 @@ DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf); DEFINE_ATTR_EVENT(xfs_attr_leaf_add); DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old); DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new); +DEFINE_ATTR_EVENT(xfs_attr_leaf_add_work); DEFINE_ATTR_EVENT(xfs_attr_leaf_addname); DEFINE_ATTR_EVENT(xfs_attr_leaf_create); +DEFINE_ATTR_EVENT(xfs_attr_leaf_compact); +DEFINE_ATTR_EVENT(xfs_attr_leaf_get); DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup); DEFINE_ATTR_EVENT(xfs_attr_leaf_replace); +DEFINE_ATTR_EVENT(xfs_attr_leaf_remove); DEFINE_ATTR_EVENT(xfs_attr_leaf_removename); DEFINE_ATTR_EVENT(xfs_attr_leaf_split); DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before); @@ -1532,12 +1572,21 @@ DEFINE_ATTR_EVENT(xfs_attr_leaf_to_sf); DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node); DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance); DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance); +DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall); DEFINE_ATTR_EVENT(xfs_attr_node_addname); +DEFINE_ATTR_EVENT(xfs_attr_node_get); DEFINE_ATTR_EVENT(xfs_attr_node_lookup); DEFINE_ATTR_EVENT(xfs_attr_node_replace); DEFINE_ATTR_EVENT(xfs_attr_node_removename); +DEFINE_ATTR_EVENT(xfs_attr_fillstate); +DEFINE_ATTR_EVENT(xfs_attr_refillstate); + +DEFINE_ATTR_EVENT(xfs_attr_rmtval_get); +DEFINE_ATTR_EVENT(xfs_attr_rmtval_set); +DEFINE_ATTR_EVENT(xfs_attr_rmtval_remove); + #define DEFINE_DA_EVENT(name) \ DEFINE_EVENT(xfs_da_class, name, \ TP_PROTO(struct xfs_da_args *args), \ @@ -1556,9 +1605,12 @@ DEFINE_DA_EVENT(xfs_da_node_split); DEFINE_DA_EVENT(xfs_da_node_remove); DEFINE_DA_EVENT(xfs_da_node_rebalance); DEFINE_DA_EVENT(xfs_da_node_unbalance); +DEFINE_DA_EVENT(xfs_da_node_toosmall); DEFINE_DA_EVENT(xfs_da_swap_lastblock); DEFINE_DA_EVENT(xfs_da_grow_inode); DEFINE_DA_EVENT(xfs_da_shrink_inode); +DEFINE_DA_EVENT(xfs_da_fixhashpath); +DEFINE_DA_EVENT(xfs_da_path_shift); DECLARE_EVENT_CLASS(xfs_dir2_space_class, TP_PROTO(struct xfs_da_args *args, int idx), -- cgit v1.2.1 From b64f3a390d3477517cbff7d613e551705540769b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 13 Nov 2012 16:40:27 -0600 Subject: xfs: use btree block initialisation functions in growfs Factor xfs_btree_init_block() to be independent of the btree cursor, and use the function to initialise btree blocks in the growfs code. This makes adding support for different format btree blocks simple. 
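To make the factoring concrete, here is a condensed sketch of the resulting call pattern (names and values taken from the diff below; illustrative only, not the complete patch):

    /* cursor-independent initialiser, now callable from growfs */
    xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0);
    /*
     * which leaves the header of the new block as:
     *   bb_magic           = XFS_ABTB_MAGIC
     *   bb_level           = 0            (leaf)
     *   bb_numrecs         = 1
     *   bb_u.s.bb_leftsib  = NULLAGBLOCK  (short pointers, no siblings)
     *   bb_u.s.bb_rightsib = NULLAGBLOCK
     */

    /* cursor-based callers go through the new thin wrapper instead */
    xfs_btree_init_block_cur(cur, level, numrecs, bp);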
Signed-off-by: Dave Chinner Reviewed-by Rich Johnston Signed-off-by: Ben Myers --- fs/xfs/xfs_btree.c | 33 ++++++++++++++++++++++++--------- fs/xfs/xfs_btree.h | 11 +++++++++++ fs/xfs/xfs_fsops.c | 37 +++++++++++++------------------------ 3 files changed, 48 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index e53e317b1582..121ea99e615a 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -853,18 +853,22 @@ xfs_btree_set_sibling( } } -STATIC void +void xfs_btree_init_block( - struct xfs_btree_cur *cur, - int level, - int numrecs, - struct xfs_btree_block *new) /* new block */ + struct xfs_mount *mp, + struct xfs_buf *bp, + __u32 magic, + __u16 level, + __u16 numrecs, + unsigned int flags) { - new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]); + struct xfs_btree_block *new = XFS_BUF_TO_BLOCK(bp); + + new->bb_magic = cpu_to_be32(magic); new->bb_level = cpu_to_be16(level); new->bb_numrecs = cpu_to_be16(numrecs); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (flags & XFS_BTREE_LONG_PTRS) { new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); } else { @@ -873,6 +877,17 @@ xfs_btree_init_block( } } +STATIC void +xfs_btree_init_block_cur( + struct xfs_btree_cur *cur, + int level, + int numrecs, + struct xfs_buf *bp) +{ + xfs_btree_init_block(cur->bc_mp, bp, xfs_magics[cur->bc_btnum], + level, numrecs, cur->bc_flags); +} + /* * Return true if ptr is the last record in the btree and * we need to track updates to this record. The decision @@ -2183,7 +2198,7 @@ xfs_btree_split( goto error0; /* Fill in the btree header for the new right block. */ - xfs_btree_init_block(cur, xfs_btree_get_level(left), 0, right); + xfs_btree_init_block_cur(cur, xfs_btree_get_level(left), 0, rbp); /* * Split the entries between the old and the new block evenly. @@ -2492,7 +2507,7 @@ xfs_btree_new_root( nptr = 2; } /* Fill in the new block's btree header and log it. */ - xfs_btree_init_block(cur, cur->bc_nlevels, 2, new); + xfs_btree_init_block_cur(cur, cur->bc_nlevels, 2, nbp); xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && !xfs_btree_ptr_is_null(cur, &rptr)); diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 5b240de104c0..c9cf2d00e236 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -378,6 +378,17 @@ xfs_btree_reada_bufs( xfs_agblock_t agbno, /* allocation group block number */ xfs_extlen_t count); /* count of filesystem blocks */ +/* + * Initialise a new btree block header + */ +void +xfs_btree_init_block( + struct xfs_mount *mp, + struct xfs_buf *bp, + __u32 magic, + __u16 level, + __u16 numrecs, + unsigned int flags); /* * Common btree core entry points.
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 7b0a997cf62b..a5034af35db7 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -125,7 +125,6 @@ xfs_growfs_data_private( xfs_extlen_t agsize; xfs_extlen_t tmpsize; xfs_alloc_rec_t *arec; - struct xfs_btree_block *block; xfs_buf_t *bp; int bucket; int dpct; @@ -263,17 +262,14 @@ xfs_growfs_data_private( error = ENOMEM; goto error0; } - block = XFS_BUF_TO_BLOCK(bp); - memset(block, 0, mp->m_sb.sb_blocksize); - block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC); - block->bb_level = 0; - block->bb_numrecs = cpu_to_be16(1); - block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); - block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); - arec = XFS_ALLOC_REC_ADDR(mp, block, 1); + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0); + + arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); + error = xfs_bwrite(bp); xfs_buf_relse(bp); if (error) @@ -289,18 +285,15 @@ xfs_growfs_data_private( error = ENOMEM; goto error0; } - block = XFS_BUF_TO_BLOCK(bp); - memset(block, 0, mp->m_sb.sb_blocksize); - block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC); - block->bb_level = 0; - block->bb_numrecs = cpu_to_be16(1); - block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); - block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); - arec = XFS_ALLOC_REC_ADDR(mp, block, 1); + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0); + + arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); nfree += be32_to_cpu(arec->ar_blockcount); + error = xfs_bwrite(bp); xfs_buf_relse(bp); if (error) @@ -316,13 +309,9 @@ xfs_growfs_data_private( error = ENOMEM; goto error0; } - block = XFS_BUF_TO_BLOCK(bp); - memset(block, 0, mp->m_sb.sb_blocksize); - block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC); - block->bb_level = 0; - block->bb_numrecs = 0; - block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); - block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, 0); + error = xfs_bwrite(bp); xfs_buf_relse(bp); if (error) -- cgit v1.2.1 From fd23683c3b1ab905cba61ea2981c156f4bf52845 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:53:59 +1100 Subject: xfs: growfs: use uncached buffers for new headers When writing the new AG headers to disk, we can't attach write verifiers because they have a dependency on the struct xfs_perag being attached to the buffer to be fully initialised and growfs can't fully initialise them until later in the process. The simplest way to avoid this problem is to use uncached buffers for writing the new headers. These buffers don't have the xfs_perag attached to them, so it's simple to detect in the write verifier and be able to skip the checks that need the xfs_perag. This enables us to attach the appropriate buffer ops to the buffer and hence calculate CRCs on the way to disk. It also means that the buffer is torn down immediately, and so the first access to the AG headers will re-read the header from disk and perform full verification of the buffer. This way we can also catch corruptions due to problems that went undetected in growfs.
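The lifecycle of such an uncached header buffer, condensed from the xfs_growfs_get_hdr_buf() helper introduced in the diff below (error handling elided; a sketch, not the full patch):

    bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0);
    xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
    /* uncached buffers carry no disk address, so assign one by hand */
    bp->b_bn = blkno;
    bp->b_maps[0].bm_bn = blkno;
    /* ... initialise the header contents ... */
    error = xfs_bwrite(bp);
    xfs_buf_relse(bp);    /* drops the last reference; buffer is torn down */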
Signed-off-by: Dave Chinner Reviewed-by Rich Johnston Signed-off-by: Ben Myers --- fs/xfs/xfs_fsops.c | 63 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index a5034af35db7..2196830bf5c0 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -114,6 +114,26 @@ xfs_fs_geometry( return 0; } +static struct xfs_buf * +xfs_growfs_get_hdr_buf( + struct xfs_mount *mp, + xfs_daddr_t blkno, + size_t numblks, + int flags) +{ + struct xfs_buf *bp; + + bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags); + if (!bp) + return NULL; + + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + bp->b_bn = blkno; + bp->b_maps[0].bm_bn = blkno; + + return bp; +} + static int xfs_growfs_data_private( xfs_mount_t *mp, /* mount point for filesystem */ @@ -189,15 +209,15 @@ xfs_growfs_data_private( /* * AG freelist header block */ - bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0); + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0); if (!bp) { error = ENOMEM; goto error0; } + agf = XFS_BUF_TO_AGF(bp); - memset(agf, 0, mp->m_sb.sb_sectsize); agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); agf->agf_seqno = cpu_to_be32(agno); @@ -226,15 +246,15 @@ xfs_growfs_data_private( /* * AG inode header block */ - bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0); + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0); if (!bp) { error = ENOMEM; goto error0; } + agi = XFS_BUF_TO_AGI(bp); - memset(agi, 0, mp->m_sb.sb_sectsize); agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); agi->agi_seqno = cpu_to_be32(agno); @@ -255,16 +275,16 @@ xfs_growfs_data_private( /* * BNO btree root block */ - bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0); + if (!bp) { error = ENOMEM; goto error0; } - xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0); + xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0); arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( @@ -278,16 +298,15 @@ xfs_growfs_data_private( /* * CNT btree root block */ - bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0); if (!bp) { error = ENOMEM; goto error0; } - xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0); + xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0); arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( @@ -302,14 +321,14 @@ xfs_growfs_data_private( /* * INO btree root block */ - bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, 
XFS_IBT_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0); if (!bp) { error = ENOMEM; goto error0; } - xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, 0); error = xfs_bwrite(bp); -- cgit v1.2.1 From de497688daaabbab425a8a969528272ec1d962a6 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:00 +1100 Subject: xfs: make growfs initialise the AGFL header For verification purposes, AGFLs need to be initialised to a known set of values. For upcoming CRC changes, they are also headers that need to be initialised. Currently, growfs does neither for the AGFLs - it ignores them completely. Add initialisation of the AGFL to be full of invalid block numbers (NULLAGBLOCK) to put the infrastructure in place needed for CRC support. Includes a comment clarification from Jeff Liu. Signed-off-by: Dave Chinner Reviewed-by Rich Johnston Signed-off-by: Ben Myers --- fs/xfs/xfs_fsops.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 2196830bf5c0..bd9cb7f0b073 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -140,6 +140,7 @@ xfs_growfs_data_private( xfs_growfs_data_t *in) /* growfs data input struct */ { xfs_agf_t *agf; + struct xfs_agfl *agfl; xfs_agi_t *agi; xfs_agnumber_t agno; xfs_extlen_t agsize; @@ -207,7 +208,7 @@ xfs_growfs_data_private( nfree = 0; for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { /* - * AG freelist header block + * AG freespace header block */ bp = xfs_growfs_get_hdr_buf(mp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), @@ -243,6 +244,26 @@ xfs_growfs_data_private( if (error) goto error0; + /* + * AG freelist header block + */ + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0); + if (!bp) { + error = ENOMEM; + goto error0; + } + + agfl = XFS_BUF_TO_AGFL(bp); + for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) + agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) + goto error0; + /* * AG inode header block */ -- cgit v1.2.1 From fb6791d100d1bba20b5cdbc4912e1f7086ec60f8 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Tue, 13 Nov 2012 10:58:56 -0500 Subject: GFS2: skip dlm_unlock calls in unmount When unmounting, gfs2 does a full dlm_unlock operation on every cached lock. This can create a very large amount of work and can take a long time to complete. However, the vast majority of these dlm unlock operations are unnecessary because after all the unlocks are done, gfs2 leaves the dlm lockspace, which automatically clears the locks of the leaving node, without unlocking each one individually. So, gfs2 can skip explicit dlm unlocks, and use dlm_release_lockspace to remove the locks implicitly. The one exception is when the lock's lvb is being used. In this case, dlm_unlock is called because it may update the lvb of the resource. 
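In outline, the unmount sequence then becomes something like the following (condensed from the diff below plus gfs2's existing unmount path; illustrative only):

    set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags);    /* in gfs2_gl_hash_clear() */
    glock_hash_walk(clear_glock, sdp);    /* most glocks freed with no dlm_unlock */
    /* ... later, leaving the lockspace clears this node's locks en masse ... */
    dlm_release_lockspace(ls->ls_dlm, 2);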
Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 1 + fs/gfs2/incore.h | 1 + fs/gfs2/lock_dlm.c | 8 ++++++++ 3 files changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 6114571a979a..9d29a5167d34 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1526,6 +1526,7 @@ static void dump_glock_func(struct gfs2_glock *gl) void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) { + set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags); glock_hash_walk(clear_glock, sdp); flush_workqueue(glock_workqueue); wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0); diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index a46f03485936..a35ef5cd1480 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -539,6 +539,7 @@ enum { SDF_DEMOTE = 5, SDF_NOJOURNALID = 6, SDF_RORECOVERY = 7, /* read only recovery */ + SDF_SKIP_DLM_UNLOCK = 8, }; #define GFS2_FSNAME_LEN 256 diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 0fb6539b0c8c..f6504d3fadb3 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -289,6 +289,14 @@ static void gdlm_put_lock(struct gfs2_glock *gl) gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT); gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); gfs2_update_request_times(gl); + + /* don't want to skip dlm_unlock writing the lvb when lock is ex */ + if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) && + gl->gl_state != LM_ST_EXCLUSIVE) { + gfs2_glock_free(gl); + return; + } + error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK, NULL, gl); if (error) { -- cgit v1.2.1 From 135ae8270d233067c8be426e2a59d0733ba74723 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Sat, 10 Nov 2012 07:20:25 -0500 Subject: nfsd4: init_session should be declared static Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 37b19f7948ed..7de9ba00a718 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -947,7 +947,7 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan) return new; } -void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses) +static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses) { int idx; -- cgit v1.2.1 From 2b4cf668a7b8f84182a35f07152d8b6f012629d2 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Tue, 13 Nov 2012 15:41:27 -0500 Subject: nfsd4: get_backchannel_cred should be static Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4callback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 7bb187ac1492..996847023015 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -642,7 +642,7 @@ int set_callback_cred(void) return 0; } -struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses) +static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses) { if (clp->cl_minorversion == 0) { return get_rpccred(callback_cred); -- cgit v1.2.1 From f5b8911b67eb4f15d95d5e5324d376d4a49d56e8 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 14 Nov 2012 17:42:47 +1100 Subject: xfs: remove xfs_tosspages It's a buggy, unnecessary wrapper that is duplicating truncate_pagecache_range(). 
When replacing the call in xfs_change_file_space(), also ensure that the length being allocated/freed is always positive before making any changes. These checks are done in the lower extent manipulation functions, too, but we need to do them before any page cache operations. Reported-by: Andrew Dahl Signed-off-by: Dave Chinner Reviewed-By: Andrew Dahl Signed-off-by: Ben Myers --- fs/xfs/xfs_dfrag.c | 3 +-- fs/xfs/xfs_fs_subr.c | 12 ------------ fs/xfs/xfs_vnodeops.c | 30 +++++++++++++++++++++++++----- fs/xfs/xfs_vnodeops.h | 2 -- 4 files changed, 26 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index b9b8646e62db..b2c63a28afa7 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -315,8 +315,7 @@ xfs_swap_extents( * are safe. We don't really care if non-io related * fields change. */ - - xfs_tosspages(ip, 0, -1, FI_REMAPF); + truncate_pagecache_range(VFS_I(ip), 0, -1); tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); if ((error = xfs_trans_reserve(tp, 0, diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c index 652b875a9d4c..d49de3d70456 100644 --- a/fs/xfs/xfs_fs_subr.c +++ b/fs/xfs/xfs_fs_subr.c @@ -25,18 +25,6 @@ * note: all filemap functions return negative error codes. These * need to be inverted before returning to the xfs core functions. */ -void -xfs_tosspages( - xfs_inode_t *ip, - xfs_off_t first, - xfs_off_t last, - int fiopt) -{ - /* can't toss partial tail pages, so mask them out */ - last &= ~(PAGE_SIZE - 1); - truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1); -} - int xfs_flushinval_pages( xfs_inode_t *ip, diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index c2ddd7a43942..de3702a57e55 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2118,7 +2118,7 @@ xfs_change_file_space( xfs_fsize_t fsize; int setprealloc; xfs_off_t startoffset; - xfs_off_t llen; + xfs_off_t end; xfs_trans_t *tp; struct iattr iattr; int prealloc_type; @@ -2139,12 +2139,30 @@ xfs_change_file_space( return XFS_ERROR(EINVAL); } - llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len; + /* + * length of <= 0 for resv/unresv/zero is invalid. length for + * alloc/free is ignored completely and we have no idea what userspace + * might have set it to, so set it to zero to allow range + * checks to pass. 
+ */ + switch (cmd) { + case XFS_IOC_ZERO_RANGE: + case XFS_IOC_RESVSP: + case XFS_IOC_RESVSP64: + case XFS_IOC_UNRESVSP: + case XFS_IOC_UNRESVSP64: + if (bf->l_len <= 0) + return XFS_ERROR(EINVAL); + break; + default: + bf->l_len = 0; + break; + } if (bf->l_start < 0 || bf->l_start > mp->m_super->s_maxbytes || - bf->l_start + llen < 0 || - bf->l_start + llen > mp->m_super->s_maxbytes) + bf->l_start + bf->l_len < 0 || + bf->l_start + bf->l_len >= mp->m_super->s_maxbytes) return XFS_ERROR(EINVAL); bf->l_whence = 0; @@ -2169,7 +2187,9 @@ xfs_change_file_space( switch (cmd) { case XFS_IOC_ZERO_RANGE: prealloc_type |= XFS_BMAPI_CONVERT; - xfs_tosspages(ip, startoffset, startoffset + bf->l_len, 0); + end = round_down(startoffset + bf->l_len, PAGE_SIZE) - 1; + if (startoffset > end) + truncate_pagecache_range(VFS_I(ip), startoffset, end); /* FALLTHRU */ case XFS_IOC_RESVSP: case XFS_IOC_RESVSP64: diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 52fafc416a0c..d48141d6bc3b 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -48,8 +48,6 @@ int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); -void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first, - xfs_off_t last, int fiopt); int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last, int fiopt); int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first, -- cgit v1.2.1 From d6638ae244f6323fcdf85e72eb4a5af6f6212893 Mon Sep 17 00:00:00 2001 From: Andrew Dahl Date: Wed, 14 Nov 2012 12:52:26 -0600 Subject: xfs: reverse the check on XFS_IOC_ZERO_RANGE Reversing the check on XFS_IOC_ZERO_RANGE. Range should be zeroed if the start is less than or equal to the end. Signed-off-by: Andrew Dahl Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_vnodeops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index de3702a57e55..46a7a5de5d6d 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2188,7 +2188,7 @@ xfs_change_file_space( case XFS_IOC_ZERO_RANGE: prealloc_type |= XFS_BMAPI_CONVERT; end = round_down(startoffset + bf->l_len, PAGE_SIZE) - 1; - if (startoffset > end) + if (startoffset <= end) truncate_pagecache_range(VFS_I(ip), startoffset, end); /* FALLTHRU */ case XFS_IOC_RESVSP: -- cgit v1.2.1 From 95eacf0f71b7682a05b8242c49c68e8e4bb673e3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:53:55 +1100 Subject: xfs: remove xfs_wait_on_pages() It's just a simple wrapper around a VFS function that is only called by another function in xfs_fs_subr.c. Remove it and call the VFS function directly. Signed-off-by: Dave Chinner Reviewed-by: Andrew Dahl Signed-off-by: Ben Myers --- fs/xfs/xfs_fs_subr.c | 18 ++---------------- fs/xfs/xfs_vnodeops.h | 1 - 2 files changed, 2 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c index d49de3d70456..33658234dfc5 100644 --- a/fs/xfs/xfs_fs_subr.c +++ b/fs/xfs/xfs_fs_subr.c @@ -62,23 +62,9 @@ xfs_flush_pages( last == -1 ? LLONG_MAX : last); if (flags & XBF_ASYNC) return ret; - ret2 = xfs_wait_on_pages(ip, first, last); + ret2 = -filemap_fdatawait_range(mapping, first, + last == -1 ? 
XFS_ISIZE(ip) - 1 : last); if (!ret) ret = ret2; return ret; } - -int -xfs_wait_on_pages( - xfs_inode_t *ip, - xfs_off_t first, - xfs_off_t last) -{ - struct address_space *mapping = VFS_I(ip)->i_mapping; - - if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { - return -filemap_fdatawait_range(mapping, first, - last == -1 ? XFS_ISIZE(ip) - 1 : last); - } - return 0; -} diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index d48141d6bc3b..c8ad48b61a25 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -52,7 +52,6 @@ int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last, int fiopt); int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last, uint64_t flags, int fiopt); -int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last); int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool); -- cgit v1.2.1 From 4bc1ea6b8ddd4f2bd78944fbe5a1042ac14b1f5f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:53:56 +1100 Subject: xfs: remove xfs_flush_pages It is a complex wrapper around VFS functions, but there are VFS functions that provide exactly the same functionality. Call the VFS functions directly and remove the unnecessary indirection and complexity. We don't need to care about clearing the XFS_ITRUNCATED flag, as that is done during .writepages. Hence it is cleared by the VFS writeback path if there is anything to write back during the flush. Signed-off-by: Dave Chinner Reviewed-by: Andrew Dahl Signed-off-by: Ben Myers --- fs/xfs/xfs_aops.c | 2 +- fs/xfs/xfs_bmap.c | 2 +- fs/xfs/xfs_fs_subr.c | 24 ------------------------ fs/xfs/xfs_iops.c | 4 ++-- fs/xfs/xfs_vnodeops.c | 7 +++++-- fs/xfs/xfs_vnodeops.h | 2 -- 6 files changed, 9 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e57e2daa357c..71361da1f77c 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1641,7 +1641,7 @@ xfs_vm_bmap( trace_xfs_vm_bmap(XFS_I(inode)); xfs_ilock(ip, XFS_IOLOCK_SHARED); - xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF); + filemap_write_and_wait(mapping); xfs_iunlock(ip, XFS_IOLOCK_SHARED); return generic_block_bmap(mapping, block, xfs_get_blocks); } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 83d0cf3df930..a60f3d1f151c 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5599,7 +5599,7 @@ xfs_getbmap( xfs_ilock(ip, XFS_IOLOCK_SHARED); if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) { - error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF); + error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); if (error) goto out_unlock_iolock; } diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c index 33658234dfc5..b5380893728e 100644 --- a/fs/xfs/xfs_fs_subr.c +++ b/fs/xfs/xfs_fs_subr.c @@ -44,27 +44,3 @@ xfs_flushinval_pages( truncate_inode_pages_range(mapping, first, last); return -ret; } -int -xfs_flush_pages( - xfs_inode_t *ip, - xfs_off_t first, - xfs_off_t last, - uint64_t flags, - int fiopt) -{ - struct address_space *mapping = VFS_I(ip)->i_mapping; - int ret = 0; - int ret2; - - xfs_iflags_clear(ip, XFS_ITRUNCATED); - ret = -filemap_fdatawrite_range(mapping, first, - last == -1 ? LLONG_MAX : last); - if (flags & XBF_ASYNC) - return ret; - ret2 = -filemap_fdatawait_range(mapping, first, - last == -1 ?
XFS_ISIZE(ip) - 1 : last); - if (!ret) - ret = ret2; - return ret; -} diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 81f5c4953287..d82efaa2ac73 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -780,8 +780,8 @@ xfs_setattr_size( * care about here. */ if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { - error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0, - FI_NONE); + error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + ip->i_d.di_size, newsize); if (error) goto out_unlock; } diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 46a7a5de5d6d..c00326afa7bf 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -428,8 +428,11 @@ xfs_release( truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); if (truncated) { xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE); - if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) - xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); + if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) { + error = -filemap_flush(VFS_I(ip)->i_mapping); + if (error) + return error; + } } } diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index c8ad48b61a25..73cb3cb15f75 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -50,8 +50,6 @@ int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last, int fiopt); -int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first, - xfs_off_t last, uint64_t flags, int fiopt); int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool); -- cgit v1.2.1 From fb59581404ab7ec5075299065c22cb211a9262a9 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:53:57 +1100 Subject: xfs: remove xfs_flushinval_pages It's just a simple wrapper around VFS functionality, and is actually buggy in that it doesn't remove mappings before invalidating the page cache. Remove it and replace it with the correct VFS functionality.
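Each call site is converted to the equivalent VFS sequence, which does handle mappings correctly. Schematically (per the diffs below, with "first" standing for whatever offset the caller uses):

    /* was: error = xfs_flushinval_pages(ip, first, -1, FI_REMAPF_LOCKED); */
    error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, first, -1);
    if (error)
        goto out;
    truncate_pagecache_range(VFS_I(ip), first, -1);    /* unmaps, then tosses the page cache */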
Signed-off-by: Dave Chinner Reviewed-by: Andrew Dahl Signed-off-by: Ben Myers --- fs/xfs/Makefile | 1 - fs/xfs/xfs_dfrag.c | 10 ++++------ fs/xfs/xfs_file.c | 23 ++++++++++++----------- fs/xfs/xfs_fs_subr.c | 46 ---------------------------------------------- fs/xfs/xfs_vnodeops.c | 11 +++++------ fs/xfs/xfs_vnodeops.h | 2 -- 6 files changed, 21 insertions(+), 72 deletions(-) delete mode 100644 fs/xfs/xfs_fs_subr.c (limited to 'fs') diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index e65357bb3dc6..d02201df855b 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -37,7 +37,6 @@ xfs-y += xfs_aops.o \ xfs_file.o \ xfs_filestream.o \ xfs_fsops.o \ - xfs_fs_subr.o \ xfs_globals.o \ xfs_icache.o \ xfs_ioctl.o \ diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index b2c63a28afa7..d0e9c74d3d96 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -246,12 +246,10 @@ xfs_swap_extents( goto out_unlock; } - if (VN_CACHED(VFS_I(tip)) != 0) { - error = xfs_flushinval_pages(tip, 0, -1, - FI_REMAPF_LOCKED); - if (error) - goto out_unlock; - } + error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); + if (error) + goto out_unlock; + truncate_pagecache_range(VFS_I(ip), 0, -1); /* Verify O_DIRECT for ftmp */ if (VN_CACHED(VFS_I(tip)) != 0) { diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index daf4066c24b2..c42f99e71f14 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -255,15 +255,14 @@ xfs_file_aio_read( xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; - if ((iocb->ki_pos & target->bt_smask) || - (size & target->bt_smask)) { - if (iocb->ki_pos == i_size_read(inode)) + if ((pos & target->bt_smask) || (size & target->bt_smask)) { + if (pos == i_size_read(inode)) return 0; return -XFS_ERROR(EINVAL); } } - n = mp->m_super->s_maxbytes - iocb->ki_pos; + n = mp->m_super->s_maxbytes - pos; if (n <= 0 || size == 0) return 0; @@ -289,20 +288,21 @@ xfs_file_aio_read( xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); if (inode->i_mapping->nrpages) { - ret = -xfs_flushinval_pages(ip, - (iocb->ki_pos & PAGE_CACHE_MASK), - -1, FI_REMAPF_LOCKED); + ret = -filemap_write_and_wait_range( + VFS_I(ip)->i_mapping, + pos, -1); if (ret) { xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } + truncate_pagecache_range(VFS_I(ip), pos, -1); } xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); } - trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); + trace_xfs_file_read(ip, size, pos, ioflags); - ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos); + ret = generic_file_aio_read(iocb, iovp, nr_segs, pos); if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); @@ -670,10 +670,11 @@ xfs_file_dio_aio_write( goto out; if (mapping->nrpages) { - ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, - FI_REMAPF_LOCKED); + ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + pos, -1); if (ret) goto out; + truncate_pagecache_range(VFS_I(ip), pos, -1); } /* diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c deleted file mode 100644 index b5380893728e..000000000000 --- a/fs/xfs/xfs_fs_subr.c +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2000-2002,2005-2006 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_vnodeops.h" -#include "xfs_bmap_btree.h" -#include "xfs_inode.h" -#include "xfs_trace.h" - -/* - * note: all filemap functions return negative error codes. These - * need to be inverted before returning to the xfs core functions. - */ -int -xfs_flushinval_pages( - xfs_inode_t *ip, - xfs_off_t first, - xfs_off_t last, - int fiopt) -{ - struct address_space *mapping = VFS_I(ip)->i_mapping; - int ret = 0; - - trace_xfs_pagecache_inval(ip, first, last); - - xfs_iflags_clear(ip, XFS_ITRUNCATED); - ret = filemap_write_and_wait_range(mapping, first, - last == -1 ? LLONG_MAX : last); - if (!ret) - truncate_inode_pages_range(mapping, first, last); - return -ret; -} diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index c00326afa7bf..81c61fd17890 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -1958,12 +1958,11 @@ xfs_free_file_space( rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); ioffset = offset & ~(rounding - 1); - - if (VN_CACHED(VFS_I(ip)) != 0) { - error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED); - if (error) - goto out_unlock_iolock; - } + error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + ioffset, -1); + if (error) + goto out_unlock_iolock; + truncate_pagecache_range(VFS_I(ip), ioffset, -1); /* * Need to zero the stuff we're not freeing, on disk. diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 73cb3cb15f75..91a03fa3814f 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -48,8 +48,6 @@ int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); -int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first, - xfs_off_t last, int fiopt); int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool); -- cgit v1.2.1 From 70a6f46d7b0ec03653b9ab3f8063a9717a4a53ef Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 14 Nov 2012 11:49:53 +0000 Subject: pstore: Fix NULL pointer dereference in console writes Passing a NULL id causes a NULL pointer dereference in writers such as erst_writer and efi_pstore_write because they expect to update this id. Pass a dummy id instead. This avoids a cascade of oopses caused when the initial pstore_console_write passes a NULL, which in turn causes writes to the console causing further oopses in subsequent pstore_console_write calls.
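To see why a dummy is needed: a pstore backend's write hook reports the record it created back through the id pointer. The sketch below is a hypothetical backend (example_writer and next_record_id are placeholder names, not in-tree code), but the dereference has the same shape in erst_writer and efi_pstore_write:

	static u64 next_record_id;	/* hypothetical backend state */

	static int example_writer(enum pstore_type_id type, u64 *id,
				  size_t size)
	{
		*id = ++next_record_id;	/* oopses here if id == NULL */
		return 0;
	}

Hence the console path below passes the address of a throwaway u64 whose value is simply discarded.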
Signed-off-by: Colin Ian King Acked-by: Kees Cook Cc: stable@vger.kernel.org Signed-off-by: Anton Vorontsov --- fs/pstore/platform.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index a40da07e93d6..947fbe06c3b1 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -161,6 +161,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) while (s < e) { unsigned long flags; + u64 id; if (c > psinfo->bufsize) c = psinfo->bufsize; @@ -172,7 +173,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) spin_lock_irqsave(&psinfo->buf_lock, flags); } memcpy(psinfo->buf, s, c); - psinfo->write(PSTORE_TYPE_CONSOLE, 0, NULL, 0, c, psinfo); + psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, c, psinfo); spin_unlock_irqrestore(&psinfo->buf_lock, flags); s += c; c = e - s; -- cgit v1.2.1 From 66bea92c69477a75a5d37b9bfed5773c92a3c4b4 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 14 Nov 2012 22:22:05 -0500 Subject: ext4: init pagevec in ext4_da_block_invalidatepages ext4_da_block_invalidatepages is missing a pagevec_init(), which means that pvec->cold contains random garbage. This affects whether the page goes to the front or back of the LRU when ->cold makes it to free_hot_cold_page(). Reviewed-by: Lukas Czerner Reviewed-by: Carlos Maiolino Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7f9ccc1381a9..52f7ff2f2e7e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1482,6 +1482,7 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); ext4_es_remove_extent(inode, start, last - start + 1); + pagevec_init(&pvec, 0); while (index <= end) { nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); if (nr_pages == 0) -- cgit v1.2.1 From 45634cd8cb6541523227753944c7417ac3d20f94 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 7 Feb 2012 16:18:52 -0800 Subject: userns: Support autofs4 interacting with multiple user namespaces Use kuid_t and kgid_t in struct autofs_info and struct autofs_wait_queue. When creating directories and symlinks default the uid and gid of the mount requester to the global root uid and gid. autofs4_wait will update these fields when a mount is requested. When generating autofsv5 packets report the uid and gid of the mount requester in the user namespace of the process that opened the pipe, reporting unmapped uids and gids as overflowuid and overflowgid. In autofs_dev_ioctl_requester return the uid and gid of the last mount requester converted into the calling process's user namespace. When the uid or gid doesn't map, return overflowuid and overflowgid as appropriate, allowing failure to find a mount requester to be distinguished from failure to map a mount requester. The uid and gid mount options specifying the user and group of the root autofs inode are converted into kuid and kgid as they are parsed, defaulting to the current uid and current gid of the process that mounts autofs. Mounting of autofs for the present remains confined to processes in the initial user namespace. Cc: Ian Kent Acked-by: Serge Hallyn Signed-off-by: Eric W.
Biederman --- fs/autofs4/autofs_i.h | 8 ++++---- fs/autofs4/dev-ioctl.c | 4 ++-- fs/autofs4/inode.c | 24 +++++++++++++++--------- fs/autofs4/waitq.c | 5 +++-- 4 files changed, 24 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 908e18455413..b785e7707959 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -74,8 +74,8 @@ struct autofs_info { unsigned long last_used; atomic_t count; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; }; #define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ @@ -89,8 +89,8 @@ struct autofs_wait_queue { struct qstr name; u32 dev; u64 ino; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; pid_t pid; pid_t tgid; /* This is for status reporting upon return */ diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index a16214109d31..9f68a37bb2b2 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -437,8 +437,8 @@ static int autofs_dev_ioctl_requester(struct file *fp, err = 0; autofs4_expire_wait(path.dentry); spin_lock(&sbi->fs_lock); - param->requester.uid = ino->uid; - param->requester.gid = ino->gid; + param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid); + param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid); spin_unlock(&sbi->fs_lock); } path_put(&path); diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 8a4fed8ead30..b104726e2d0a 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -36,8 +36,8 @@ struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi) void autofs4_clean_ino(struct autofs_info *ino) { - ino->uid = 0; - ino->gid = 0; + ino->uid = GLOBAL_ROOT_UID; + ino->gid = GLOBAL_ROOT_GID; ino->last_used = jiffies; } @@ -79,10 +79,12 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root) return 0; seq_printf(m, ",fd=%d", sbi->pipefd); - if (root_inode->i_uid != 0) - seq_printf(m, ",uid=%u", root_inode->i_uid); - if (root_inode->i_gid != 0) - seq_printf(m, ",gid=%u", root_inode->i_gid); + if (!uid_eq(root_inode->i_uid, GLOBAL_ROOT_UID)) + seq_printf(m, ",uid=%u", + from_kuid_munged(&init_user_ns, root_inode->i_uid)); + if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID)) + seq_printf(m, ",gid=%u", + from_kgid_munged(&init_user_ns, root_inode->i_gid)); seq_printf(m, ",pgrp=%d", sbi->oz_pgrp); seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ); seq_printf(m, ",minproto=%d", sbi->min_proto); @@ -126,7 +128,7 @@ static const match_table_t tokens = { {Opt_err, NULL} }; -static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, +static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto) { char *p; @@ -159,12 +161,16 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, case Opt_uid: if (match_int(args, &option)) return 1; - *uid = option; + *uid = make_kuid(current_user_ns(), option); + if (!uid_valid(*uid)) + return 1; break; case Opt_gid: if (match_int(args, &option)) return 1; - *gid = option; + *gid = make_kgid(current_user_ns(), option); + if (!gid_valid(*gid)) + return 1; break; case Opt_pgrp: if (match_int(args, &option)) diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index dce436e595c1..03bc1d347d8e 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -154,6 +154,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, case autofs_ptype_expire_direct: { struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet; 
+ struct user_namespace *user_ns = sbi->pipe->f_cred->user_ns; pktsz = sizeof(*packet); @@ -163,8 +164,8 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, packet->name[wq->name.len] = '\0'; packet->dev = wq->dev; packet->ino = wq->ino; - packet->uid = wq->uid; - packet->gid = wq->gid; + packet->uid = from_kuid_munged(user_ns, wq->uid); + packet->gid = from_kgid_munged(user_ns, wq->gid); packet->pid = wq->pid; packet->tgid = wq->tgid; break; -- cgit v1.2.1 From 499dcf2024092e5cce41d05599a5b51d1f92031a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 7 Feb 2012 16:26:03 -0800 Subject: userns: Support fuse interacting with multiple user namespaces Use kuid_t and kgid_t in struct fuse_conn and struct fuse_mount_data. The connection between a fuse filesystem and a fuse daemon is established when a fuse filesystem is mounted and provided with a file descriptor that the fuse daemon created by opening /dev/fuse. For now restrict the communication of uids and gids between the fuse filesystem and the fuse daemon to the initial user namespace. Enforce this by verifying the file descriptor passed to the mount of fuse was opened in the initial user namespace. Ensuring the mount happens in the initial user namespace is not necessary as mounts from non-initial user namespaces are not yet allowed. In fuse_req_init_context convert the current fsuid and fsgid into the initial user namespace for the request that will be sent to the fuse daemon. In fuse_fillattr convert the uid and gid passed from the fuse daemon from the initial user namespace into kuids and kgids. In iattr_to_fattr called from fuse_setattr convert kuids and kgids into the uids and gids in the initial user namespace before passing them to the fuse filesystem. In fuse_change_attributes_common called from fuse_dentry_revalidate, fuse_permission, fuse_getattr, fuse_setattr, and fuse_iget convert the uid and gid from the fuse daemon into a kuid and a kgid to store on the fuse inode. By default fuse mounts are restricted to tasks whose uid, suid, and euid match the fuse user_id and whose gid, sgid, and egid match the fuse group_id. Convert the user_id and group_id mount options into kuids and kgids at mount time, and use uid_eq and gid_eq to compare them in fuse_allow_task. Cc: Miklos Szeredi Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman
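All of these conversions use the standard kuid/kgid helpers. A condensed sketch of the three idioms in play, lifted from the pattern of the patch below with error handling trimmed:

	/* mount option -> kernel-internal id, rejecting unmappable values */
	d->user_id = make_kuid(current_user_ns(), value);
	if (!uid_valid(d->user_id))
		return 0;

	/* kernel-internal id -> wire value for the (init_user_ns) daemon */
	req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid());

	/* kuid_t is not a plain integer; comparisons need the typed helper */
	if (uid_eq(cred->euid, fc->user_id))
		ret = 1;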
--- fs/fuse/dev.c | 4 ++-- fs/fuse/dir.c | 20 ++++++++++---------- fs/fuse/fuse_i.h | 4 ++-- fs/fuse/inode.c | 23 ++++++++++++++--------- 4 files changed, 28 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 8c23fa7a91e6..c16335315e5d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -92,8 +92,8 @@ static void __fuse_put_request(struct fuse_req *req) static void fuse_req_init_context(struct fuse_req *req) { - req->in.h.uid = current_fsuid(); - req->in.h.gid = current_fsgid(); + req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid()); + req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid()); req->in.h.pid = current->pid; } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 324bc0850534..b7c09f9eb40c 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -818,8 +818,8 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, stat->ino = attr->ino; stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); stat->nlink = attr->nlink; - stat->uid = attr->uid; - stat->gid = attr->gid; + stat->uid = make_kuid(&init_user_ns, attr->uid); + stat->gid = make_kgid(&init_user_ns, attr->gid); stat->rdev = inode->i_rdev; stat->atime.tv_sec = attr->atime; stat->atime.tv_nsec = attr->atimensec; @@ -1007,12 +1007,12 @@ int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task) rcu_read_lock(); ret = 0; cred = __task_cred(task); - if (cred->euid == fc->user_id && - cred->suid == fc->user_id && - cred->uid == fc->user_id && - cred->egid == fc->group_id && - cred->sgid == fc->group_id && - cred->gid == fc->group_id) + if (uid_eq(cred->euid, fc->user_id) && + uid_eq(cred->suid, fc->user_id) && + uid_eq(cred->uid, fc->user_id) && + gid_eq(cred->egid, fc->group_id) && + gid_eq(cred->sgid, fc->group_id) && + gid_eq(cred->gid, fc->group_id)) ret = 1; rcu_read_unlock(); @@ -1306,9 +1306,9 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg) if (ivalid & ATTR_MODE) arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode; if (ivalid & ATTR_UID) - arg->valid |= FATTR_UID, arg->uid = iattr->ia_uid; + arg->valid |= FATTR_UID, arg->uid = from_kuid(&init_user_ns, iattr->ia_uid); if (ivalid & ATTR_GID) - arg->valid |= FATTR_GID, arg->gid = iattr->ia_gid; + arg->valid |= FATTR_GID, arg->gid = from_kgid(&init_user_ns, iattr->ia_gid); if (ivalid & ATTR_SIZE) arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size; if (ivalid & ATTR_ATIME) { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index e24dd74e3068..e105a53fc72d 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -333,10 +333,10 @@ struct fuse_conn { atomic_t count; /** The user id for this mount */ - uid_t user_id; + kuid_t user_id; /** The group id for this mount */ - gid_t group_id; + kgid_t group_id; /** The fuse mount flags for this mount */ unsigned flags; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index f0eda124cffb..73ca6b72beaf 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -60,8 +60,8 @@ MODULE_PARM_DESC(max_user_congthresh, struct fuse_mount_data { int fd; unsigned rootmode; - unsigned user_id; - unsigned group_id; + kuid_t user_id; + kgid_t group_id; unsigned fd_present:1; unsigned rootmode_present:1; unsigned user_id_present:1; @@ -164,8 +164,8 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, inode->i_ino = fuse_squash_ino(attr->ino); inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); set_nlink(inode, attr->nlink); - inode->i_uid = attr->uid; - inode->i_gid = attr->gid; +
inode->i_uid = make_kuid(&init_user_ns, attr->uid); + inode->i_gid = make_kgid(&init_user_ns, attr->gid); inode->i_blocks = attr->blocks; inode->i_atime.tv_sec = attr->atime; inode->i_atime.tv_nsec = attr->atimensec; @@ -492,14 +492,18 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) case OPT_USER_ID: if (match_int(&args[0], &value)) return 0; - d->user_id = value; + d->user_id = make_kuid(current_user_ns(), value); + if (!uid_valid(d->user_id)) + return 0; d->user_id_present = 1; break; case OPT_GROUP_ID: if (match_int(&args[0], &value)) return 0; - d->group_id = value; + d->group_id = make_kgid(current_user_ns(), value); + if (!gid_valid(d->group_id)) + return 0; d->group_id_present = 1; break; @@ -540,8 +544,8 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) struct super_block *sb = root->d_sb; struct fuse_conn *fc = get_fuse_conn_super(sb); - seq_printf(m, ",user_id=%u", fc->user_id); - seq_printf(m, ",group_id=%u", fc->group_id); + seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id)); + seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id)); if (fc->flags & FUSE_DEFAULT_PERMISSIONS) seq_puts(m, ",default_permissions"); if (fc->flags & FUSE_ALLOW_OTHER) @@ -989,7 +993,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (!file) goto err; - if (file->f_op != &fuse_dev_operations) + if ((file->f_op != &fuse_dev_operations) || + (file->f_cred->user_ns != &init_user_ns)) goto err_fput; fc = kmalloc(sizeof(*fc), GFP_KERNEL); -- cgit v1.2.1 From dba2d70c5dc520fdb569d1fd8dbd45c0e330253e Mon Sep 17 00:00:00 2001 From: David Teigland Date: Wed, 14 Nov 2012 13:46:53 -0500 Subject: GFS2: only use lvb on glocks that need it Save the effort of allocating, reading and writing the lvb for most glocks that do not use it. 
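In outline, glock types that actually use a lock value block opt in through a new GLOF_LVB flag, and both the allocation and the DLM request flags are gated on it. A sketch of the two gates, mirroring the patch below:

	/* allocate an LVB only for glock types that asked for one */
	if (glops->go_flags & GLOF_LVB) {
		gl->gl_lvb = kzalloc(GFS2_MIN_LVB_SIZE, GFP_KERNEL);
		if (!gl->gl_lvb)
			return -ENOMEM;	/* sketch; the real code also frees gl */
		gl->gl_lksb.sb_lvbptr = gl->gl_lvb;
	}

	/* and only ask the DLM to carry an LVB when one exists */
	if (gl->gl_lvb)
		lkf |= DLM_LKF_VALBLK;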
Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 27 +++++++++++++++++++++------ fs/gfs2/glops.c | 3 ++- fs/gfs2/incore.h | 3 ++- fs/gfs2/lock_dlm.c | 12 +++++++----- 4 files changed, 32 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 9d29a5167d34..2284de4d05ce 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -105,10 +105,12 @@ static void gfs2_glock_dealloc(struct rcu_head *rcu) { struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu); - if (gl->gl_ops->go_flags & GLOF_ASPACE) + if (gl->gl_ops->go_flags & GLOF_ASPACE) { kmem_cache_free(gfs2_glock_aspace_cachep, gl); - else + } else { + kfree(gl->gl_lvb); kmem_cache_free(gfs2_glock_cachep, gl); + } } void gfs2_glock_free(struct gfs2_glock *gl) @@ -545,7 +547,10 @@ __acquires(&gl->gl_spin) if (sdp->sd_lockstruct.ls_ops->lm_lock) { /* lock_dlm */ ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags); - GLOCK_BUG_ON(gl, ret); + if (ret) { + printk(KERN_ERR "GFS2: lm_lock ret %d\n", ret); + GLOCK_BUG_ON(gl, 1); + } } else { /* lock_nolock */ finish_xmote(gl, target); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) @@ -734,6 +739,18 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, if (!gl) return -ENOMEM; + memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb)); + gl->gl_lvb = NULL; + + if (glops->go_flags & GLOF_LVB) { + gl->gl_lvb = kzalloc(GFS2_MIN_LVB_SIZE, GFP_KERNEL); + if (!gl->gl_lvb) { + kmem_cache_free(cachep, gl); + return -ENOMEM; + } + gl->gl_lksb.sb_lvbptr = gl->gl_lvb; + } + atomic_inc(&sdp->sd_glock_disposal); gl->gl_sbd = sdp; gl->gl_flags = 0; @@ -751,9 +768,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, preempt_enable(); gl->gl_stats.stats[GFS2_LKS_DCOUNT] = 0; gl->gl_stats.stats[GFS2_LKS_QCOUNT] = 0; - memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb)); - memset(gl->gl_lvb, 0, 32 * sizeof(char)); - gl->gl_lksb.sb_lvbptr = gl->gl_lvb; gl->gl_tchange = jiffies; gl->gl_object = NULL; gl->gl_hold_time = GL_GLOCK_DFT_HOLD; @@ -775,6 +789,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, tmp = search_bucket(hash, sdp, &name); if (tmp) { spin_unlock_bucket(hash); + kfree(gl->gl_lvb); kmem_cache_free(cachep, gl); atomic_dec(&sdp->sd_glock_disposal); gl = tmp; diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index e86fe26c12d2..78d4184ffc7d 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -552,7 +552,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = { .go_unlock = gfs2_rgrp_go_unlock, .go_dump = gfs2_rgrp_dump, .go_type = LM_TYPE_RGRP, - .go_flags = GLOF_ASPACE, + .go_flags = GLOF_ASPACE | GLOF_LVB, }; const struct gfs2_glock_operations gfs2_trans_glops = { @@ -577,6 +577,7 @@ const struct gfs2_glock_operations gfs2_nondisk_glops = { const struct gfs2_glock_operations gfs2_quota_glops = { .go_type = LM_TYPE_QUOTA, + .go_flags = GLOF_LVB, }; const struct gfs2_glock_operations gfs2_journal_glops = { diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index a35ef5cd1480..bd577fc59e0b 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -216,6 +216,7 @@ struct gfs2_glock_operations { const int go_type; const unsigned long go_flags; #define GLOF_ASPACE 1 +#define GLOF_LVB 2 }; enum { @@ -321,7 +322,7 @@ struct gfs2_glock { ktime_t gl_dstamp; struct gfs2_lkstats gl_stats; struct dlm_lksb gl_lksb; - char gl_lvb[32]; + char *gl_lvb; unsigned long gl_tchange; void *gl_object; diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index f6504d3fadb3..d28ae37ceb3c 100644 --- 
a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -120,7 +120,7 @@ static void gdlm_ast(void *arg) gfs2_update_reply_times(gl); BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); - if (gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID) + if (gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID && gl->gl_lvb) memset(gl->gl_lvb, 0, GDLM_LVB_SIZE); switch (gl->gl_lksb.sb_status) { @@ -203,8 +203,10 @@ static int make_mode(const unsigned int lmstate) static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, const int req) { - u32 lkf = DLM_LKF_VALBLK; - u32 lkid = gl->gl_lksb.sb_lkid; + u32 lkf = 0; + + if (gl->gl_lvb) + lkf |= DLM_LKF_VALBLK; if (gfs_flags & LM_FLAG_TRY) lkf |= DLM_LKF_NOQUEUE; @@ -228,7 +230,7 @@ static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, BUG(); } - if (lkid != 0) { + if (gl->gl_lksb.sb_lkid != 0) { lkf |= DLM_LKF_CONVERT; if (test_bit(GLF_BLOCKING, &gl->gl_flags)) lkf |= DLM_LKF_QUECVT; @@ -292,7 +294,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl) /* don't want to skip dlm_unlock writing the lvb when lock is ex */ if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) && - gl->gl_state != LM_ST_EXCLUSIVE) { + gl->gl_lvb && gl->gl_state != LM_ST_EXCLUSIVE) { gfs2_glock_free(gl); return; } -- cgit v1.2.1 From 4e2f8849def738092ad6c0fc2b34737381bc9d26 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Wed, 14 Nov 2012 13:47:37 -0500 Subject: GFS2: remove redundant lvb pointer The lksb struct already contains a pointer to the lvb, so another directly from the glock struct is not needed. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 10 ++++------ fs/gfs2/incore.h | 1 - fs/gfs2/lock_dlm.c | 8 ++++---- fs/gfs2/quota.c | 6 +++--- fs/gfs2/rgrp.c | 2 +- 5 files changed, 12 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 2284de4d05ce..274b6bed5d67 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -108,7 +108,7 @@ static void gfs2_glock_dealloc(struct rcu_head *rcu) if (gl->gl_ops->go_flags & GLOF_ASPACE) { kmem_cache_free(gfs2_glock_aspace_cachep, gl); } else { - kfree(gl->gl_lvb); + kfree(gl->gl_lksb.sb_lvbptr); kmem_cache_free(gfs2_glock_cachep, gl); } } @@ -740,15 +740,13 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, return -ENOMEM; memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb)); - gl->gl_lvb = NULL; if (glops->go_flags & GLOF_LVB) { - gl->gl_lvb = kzalloc(GFS2_MIN_LVB_SIZE, GFP_KERNEL); - if (!gl->gl_lvb) { + gl->gl_lksb.sb_lvbptr = kzalloc(GFS2_MIN_LVB_SIZE, GFP_KERNEL); + if (!gl->gl_lksb.sb_lvbptr) { kmem_cache_free(cachep, gl); return -ENOMEM; } - gl->gl_lksb.sb_lvbptr = gl->gl_lvb; } atomic_inc(&sdp->sd_glock_disposal); @@ -789,7 +787,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, tmp = search_bucket(hash, sdp, &name); if (tmp) { spin_unlock_bucket(hash); - kfree(gl->gl_lvb); + kfree(gl->gl_lksb.sb_lvbptr); kmem_cache_free(cachep, gl); atomic_dec(&sdp->sd_glock_disposal); gl = tmp; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index bd577fc59e0b..c373a24fedd9 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -322,7 +322,6 @@ struct gfs2_glock { ktime_t gl_dstamp; struct gfs2_lkstats gl_stats; struct dlm_lksb gl_lksb; - char *gl_lvb; unsigned long gl_tchange; void *gl_object; diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index d28ae37ceb3c..8dad6b093716 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -120,8 +120,8 @@ static void gdlm_ast(void *arg) gfs2_update_reply_times(gl); 
BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); - if (gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID && gl->gl_lvb) - memset(gl->gl_lvb, 0, GDLM_LVB_SIZE); + if ((gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID) && gl->gl_lksb.sb_lvbptr) + memset(gl->gl_lksb.sb_lvbptr, 0, GDLM_LVB_SIZE); switch (gl->gl_lksb.sb_status) { @@ -205,7 +205,7 @@ static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, { u32 lkf = 0; - if (gl->gl_lvb) + if (gl->gl_lksb.sb_lvbptr) lkf |= DLM_LKF_VALBLK; if (gfs_flags & LM_FLAG_TRY) @@ -294,7 +294,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl) /* don't want to skip dlm_unlock writing the lvb when lock is ex */ if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) && - gl->gl_lvb && gl->gl_state != LM_ST_EXCLUSIVE) { + gl->gl_lksb.sb_lvbptr && (gl->gl_state != LM_ST_EXCLUSIVE)) { gfs2_glock_free(gl); return; } diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 6bbf64f0f5b6..ae55e248c3b7 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -869,7 +869,7 @@ static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd) if (error < 0) return error; - qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; + qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC); qlvb->__pad = 0; qlvb->qb_limit = q.qu_limit; @@ -893,7 +893,7 @@ restart: if (error) return error; - qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; + qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) { gfs2_glock_dq_uninit(q_gh); @@ -1506,7 +1506,7 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid, if (error) goto out; - qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; + qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; fdq->d_version = FS_DQUOT_VERSION; fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA; fdq->d_id = from_kqid(&init_user_ns, qid); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 5625e93bf61f..37ee061d899e 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -879,7 +879,7 @@ static int read_rindex_entry(struct gfs2_inode *ip) goto fail; rgd->rd_gl->gl_object = rgd; - rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lvb; + rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; rgd->rd_flags &= ~GFS2_RDF_UPTODATE; if (rgd->rd_data > sdp->sd_max_rg_data) sdp->sd_max_rg_data = rgd->rd_data; -- cgit v1.2.1 From 7f2210fa6b791c290e36d8b3c8af7aaf22b2aaf0 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Wed, 14 Nov 2012 18:21:05 +0300 Subject: nfsd: use service net instead of hard-coded net where possible Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. Bruce Fields
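The change is mechanical: instead of always resolving per-net data against the initial network namespace, each handler resolves it against the namespace of the transport the request arrived on (SVC_NET(rqstp) is the net of the request's transport). The before/after shape, as repeated throughout the patch below:

	/* before: per-net data of the initial namespace, always */
	struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);

	/* after: per-net data of the namespace serving this request */
	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);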
--- fs/nfsd/nfs4state.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 7de9ba00a718..207b9afbbacf 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2281,7 +2281,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, nfs4_verifier confirm = setclientid_confirm->sc_confirm; clientid_t * clid = &setclientid_confirm->sc_clientid; __be32 status; - struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); if (STALE_CLIENTID(clid, nn)) return nfserr_stale_clientid; @@ -3151,7 +3151,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct nfs4_client *clp; __be32 status; - struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); nfs4_lock_state(); dprintk("process_renew(%08x/%08x): starting\n", @@ -4104,7 +4104,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, bool new_state = false; int lkflg; int err; - struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n", (long long) lock->lk_offset, @@ -4277,7 +4277,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct file_lock *file_lock = NULL; struct nfs4_lockowner *lo; __be32 status; - struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); if (locks_in_grace(SVC_NET(rqstp))) return nfserr_grace; @@ -4453,7 +4453,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, struct list_head matches; unsigned int hashval = ownerstr_hashval(clid->cl_id, owner); __be32 status; - struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", clid->cl_boot, clid->cl_id); -- cgit v1.2.1 From c212cecfa21b3d30cd5cc2389754a46973ad9027 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Wed, 14 Nov 2012 18:21:10 +0300 Subject: nfsd: make nfs4_client network namespace dependent And use its net where possible. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J.
Bruce Fields --- fs/nfsd/nfs4callback.c | 2 +- fs/nfsd/nfs4recover.c | 9 +++------ fs/nfsd/nfs4state.c | 15 +++++++++------ fs/nfsd/state.h | 1 + 4 files changed, 14 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 996847023015..826cc269c445 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -663,7 +663,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c .to_retries = 0, }; struct rpc_create_args args = { - .net = &init_net, + .net = clp->net, .address = (struct sockaddr *) &conn->cb_addr, .addrsize = conn->cb_addrlen, .saddress = (struct sockaddr *) &conn->cb_saddr, diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index b03b6aa7a6a0..9881bcad264b 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -859,8 +859,7 @@ nfsd4_cld_create(struct nfs4_client *clp) { int ret; struct cld_upcall *cup; - /* FIXME: determine net from clp */ - struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); struct cld_net *cn = nn->cld_net; /* Don't upcall if it's already stored */ @@ -897,8 +896,7 @@ nfsd4_cld_remove(struct nfs4_client *clp) { int ret; struct cld_upcall *cup; - /* FIXME: determine net from clp */ - struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); struct cld_net *cn = nn->cld_net; /* Don't upcall if it's already removed */ @@ -935,8 +933,7 @@ nfsd4_cld_check(struct nfs4_client *clp) { int ret; struct cld_upcall *cup; - /* FIXME: determine net from clp */ - struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); struct cld_net *cn = nn->cld_net; /* Don't upcall if one was already stored during this grace pd */ diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 207b9afbbacf..001bbc99d7a4 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1263,10 +1263,9 @@ same_creds(struct svc_cred *cr1, struct svc_cred *cr2) return 0 == strcmp(cr1->cr_principal, cr2->cr_principal); } -static void gen_clid(struct nfs4_client *clp) +static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn) { static u32 current_clientid = 1; - struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); clp->cl_clientid.cl_boot = nn->boot_time; clp->cl_clientid.cl_id = current_clientid++; @@ -1305,6 +1304,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, struct nfs4_client *clp; struct sockaddr *sa = svc_addr(rqstp); int ret; + struct net *net = SVC_NET(rqstp); clp = alloc_client(name); if (clp == NULL) @@ -1335,6 +1335,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa); gen_confirm(clp); clp->cl_cb_session = NULL; + clp->net = net; return clp; } @@ -1471,7 +1472,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r else goto out_err; - conn->cb_addrlen = rpc_uaddr2sockaddr(&init_net, se->se_callback_addr_val, + conn->cb_addrlen = rpc_uaddr2sockaddr(clp->net, se->se_callback_addr_val, se->se_callback_addr_len, (struct sockaddr *)&conn->cb_addr, sizeof(conn->cb_addr)); @@ -1619,6 +1620,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, nfs4_verifier verf = exid->verifier; struct sockaddr *sa = svc_addr(rqstp); bool update = exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); rpc_ntop(sa, 
addr_str, sizeof(addr_str)); dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p " @@ -1701,7 +1703,7 @@ out_new: } new->cl_minorversion = 1; - gen_clid(new); + gen_clid(new, nn); add_to_unconfirmed(new); out_copy: exid->clientid.cl_boot = new->cl_clientid.cl_boot; @@ -2229,7 +2231,8 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_verifier clverifier = setclid->se_verf; struct nfs4_client *conf, *unconf, *new; __be32 status; - + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + /* Cases below refer to rfc 3530 section 14.2.33: */ nfs4_lock_state(); conf = find_confirmed_client_by_name(&clname); @@ -2258,7 +2261,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* case 1: probable callback update */ copy_clid(new, conf); else /* case 4 (new client) or cases 2, 3 (client reboot): */ - gen_clid(new); + gen_clid(new, nn); new->cl_minorversion = 0; gen_callback(new, setclid, rqstp); add_to_unconfirmed(new); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 029217ad2cb0..ca8ee8c3ae74 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -283,6 +283,7 @@ struct nfs4_client { unsigned long cl_cb_slot_busy; struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ /* wait here for slots */ + struct net *net; }; static inline void -- cgit v1.2.1 From 52e19c09a183d82d99f10c284bc8b27933b1d1fc Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Wed, 14 Nov 2012 18:21:16 +0300 Subject: nfsd: make reclaim_str_hashtbl allocated per net This hash holds nfs4_clients info, which are network namespace aware. So let's make it allocated per network namespace. Note: this hash is used only by legacy tracker. So let's allocate hash in tracker init. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. 
Bruce Fields --- fs/nfsd/netns.h | 12 ++++++ fs/nfsd/nfs4recover.c | 100 ++++++++++++++++++++++++++++++++++++++------------ fs/nfsd/nfs4state.c | 42 ++++++++------------- fs/nfsd/state.h | 12 +++--- 4 files changed, 111 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 65c2431ea32f..49e54790d862 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -24,6 +24,11 @@ #include #include +/* Hash tables for nfs4_clientid state */ +#define CLIENT_HASH_BITS 4 +#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) +#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1) + struct cld_net; struct nfsd_net { @@ -38,6 +43,13 @@ struct nfsd_net { struct lock_manager nfsd4_manager; bool grace_ended; time_t boot_time; + + /* + * reclaim_str_hashtbl[] holds known client info from previous reset/reboot + * used in reboot/reset lease grace period processing + */ + struct list_head *reclaim_str_hashtbl; + int reclaim_str_hashtbl_size; }; extern int nfsd_net_id; diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 9881bcad264b..376692ab1b3b 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -176,6 +176,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) struct dentry *dir, *dentry; struct nfs4_client_reclaim *crp; int status; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname); @@ -222,7 +223,7 @@ out_unlock: mutex_unlock(&dir->d_inode->i_mutex); if (status == 0) { if (in_grace) { - crp = nfs4_client_to_reclaim(dname); + crp = nfs4_client_to_reclaim(dname, nn); if (crp) crp->cr_clp = clp; } @@ -237,7 +238,7 @@ out_unlock: nfs4_reset_creds(original_cred); } -typedef int (recdir_func)(struct dentry *, struct dentry *); +typedef int (recdir_func)(struct dentry *, struct dentry *, struct nfsd_net *); struct name_list { char name[HEXDIR_LEN]; @@ -263,7 +264,7 @@ nfsd4_build_namelist(void *arg, const char *name, int namlen, } static int -nfsd4_list_rec_dir(recdir_func *f) +nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) { const struct cred *original_cred; struct dentry *dir = rec_file->f_path.dentry; @@ -292,7 +293,7 @@ nfsd4_list_rec_dir(recdir_func *f) status = PTR_ERR(dentry); break; } - status = f(dir, dentry); + status = f(dir, dentry, nn); dput(dentry); } list_del(&entry->list); @@ -336,6 +337,7 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) struct nfs4_client_reclaim *crp; char dname[HEXDIR_LEN]; int status; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); if (!rec_file || !test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; @@ -359,9 +361,9 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) vfs_fsync(rec_file, 0); if (in_grace) { /* remove reclaim record */ - crp = nfsd4_find_reclaim_client(dname); + crp = nfsd4_find_reclaim_client(dname, nn); if (crp) - nfs4_remove_reclaim_record(crp); + nfs4_remove_reclaim_record(crp, nn); } } out_drop_write: @@ -373,11 +375,11 @@ out: } static int -purge_old(struct dentry *parent, struct dentry *child) +purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) { int status; - if (nfs4_has_reclaimed_state(child->d_name.name)) + if (nfs4_has_reclaimed_state(child->d_name.name, nn)) return 0; status = vfs_rmdir(parent->d_inode, child); @@ -392,6 +394,7 @@ static void nfsd4_recdir_purge_old(struct net *net, time_t boot_time) { int status; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); in_grace = false; if (!rec_file) @@ -399,19 +402,19 @@ nfsd4_recdir_purge_old(struct net *net, time_t boot_time) 
status = mnt_want_write_file(rec_file); if (status) goto out; - status = nfsd4_list_rec_dir(purge_old); + status = nfsd4_list_rec_dir(purge_old, nn); if (status == 0) vfs_fsync(rec_file, 0); mnt_drop_write_file(rec_file); out: - nfs4_release_reclaim(); + nfs4_release_reclaim(nn); if (status) printk("nfsd4: failed to purge old clients from recovery" " directory %s\n", rec_file->f_path.dentry->d_name.name); } static int -load_recdir(struct dentry *parent, struct dentry *child) +load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) { if (child->d_name.len != HEXDIR_LEN - 1) { printk("nfsd4: illegal name %s in recovery directory\n", @@ -419,18 +422,19 @@ load_recdir(struct dentry *parent, struct dentry *child) /* Keep trying; maybe the others are OK: */ return 0; } - nfs4_client_to_reclaim(child->d_name.name); + nfs4_client_to_reclaim(child->d_name.name, nn); return 0; } static int -nfsd4_recdir_load(void) { +nfsd4_recdir_load(struct net *net) { int status; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (!rec_file) return 0; - status = nfsd4_list_rec_dir(load_recdir); + status = nfsd4_list_rec_dir(load_recdir, nn); if (status) printk("nfsd4: failed loading clients from recovery" " directory %s\n", rec_file->f_path.dentry->d_name.name); @@ -474,11 +478,53 @@ nfsd4_init_recdir(void) return status; } + +static int +nfs4_legacy_state_init(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + int i; + + nn->reclaim_str_hashtbl = kmalloc(sizeof(struct list_head) * + CLIENT_HASH_SIZE, GFP_KERNEL); + if (!nn->reclaim_str_hashtbl) + return -ENOMEM; + + for (i = 0; i < CLIENT_HASH_SIZE; i++) + INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]); + nn->reclaim_str_hashtbl_size = 0; + + return 0; +} + +static void +nfs4_legacy_state_shutdown(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + kfree(nn->reclaim_str_hashtbl); +} + static int nfsd4_load_reboot_recovery_data(struct net *net) { int status; + nfs4_lock_state(); + status = nfsd4_init_recdir(); + if (!status) + status = nfsd4_recdir_load(net); + nfs4_unlock_state(); + if (status) + printk(KERN_ERR "NFSD: Failure reading reboot recovery data\n"); + return status; +} + +static int +nfsd4_legacy_tracking_init(struct net *net) +{ + int status; + /* XXX: The legacy code won't work in a container */ if (net != &init_net) { WARN(1, KERN_ERR "NFSD: attempt to initialize legacy client " @@ -486,13 +532,17 @@ nfsd4_load_reboot_recovery_data(struct net *net) return -EINVAL; } - nfs4_lock_state(); - status = nfsd4_init_recdir(); - if (!status) - status = nfsd4_recdir_load(); - nfs4_unlock_state(); + status = nfs4_legacy_state_init(net); if (status) - printk(KERN_ERR "NFSD: Failure reading reboot recovery data\n"); + return status; + + status = nfsd4_load_reboot_recovery_data(net); + if (status) + goto err; + return 0; + +err: + nfs4_legacy_state_shutdown(net); return status; } @@ -508,8 +558,11 @@ nfsd4_shutdown_recdir(void) static void nfsd4_legacy_tracking_exit(struct net *net) { - nfs4_release_reclaim(); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + nfs4_release_reclaim(nn); nfsd4_shutdown_recdir(); + nfs4_legacy_state_shutdown(net); } /* @@ -545,6 +598,7 @@ nfsd4_check_legacy_client(struct nfs4_client *clp) int status; char dname[HEXDIR_LEN]; struct nfs4_client_reclaim *crp; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); /* did we already find that this client is stable? 
*/ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) @@ -557,7 +611,7 @@ nfsd4_check_legacy_client(struct nfs4_client *clp) } /* look for it in the reclaim hashtable otherwise */ - crp = nfsd4_find_reclaim_client(dname); + crp = nfsd4_find_reclaim_client(dname, nn); if (crp) { set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); crp->cr_clp = clp; @@ -568,7 +622,7 @@ nfsd4_check_legacy_client(struct nfs4_client *clp) } static struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = { - .init = nfsd4_load_reboot_recovery_data, + .init = nfsd4_legacy_tracking_init, .exit = nfsd4_legacy_tracking_exit, .create = nfsd4_create_clid_dir, .remove = nfsd4_remove_clid_dir, diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 001bbc99d7a4..ba4785559509 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -393,11 +393,6 @@ unhash_delegation(struct nfs4_delegation *dp) /* client_lock protects the client lru list and session hash table */ static DEFINE_SPINLOCK(client_lock); -/* Hash tables for nfs4_clientid state */ -#define CLIENT_HASH_BITS 4 -#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) -#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1) - static unsigned int clientid_hashval(u32 id) { return id & CLIENT_HASH_MASK; @@ -409,11 +404,8 @@ static unsigned int clientstr_hashval(const char *name) } /* - * reclaim_str_hashtbl[] holds known client info from previous reset/reboot - * used in reboot/reset lease grace period processing - * * conf_id_hashtbl[], and conf_name_tree hold confirmed - * setclientid_confirmed info. + * setclientid_confirmed info. * * unconf_id_hashtbl[] and unconf_name_tree hold unconfirmed * setclientid info. @@ -426,8 +418,6 @@ static unsigned int clientstr_hashval(const char *name) * * All of the above fields are protected by the client_mutex. */ -static struct list_head reclaim_str_hashtbl[CLIENT_HASH_SIZE]; -static int reclaim_str_hashtbl_size = 0; static struct list_head conf_id_hashtbl[CLIENT_HASH_SIZE]; static struct list_head unconf_id_hashtbl[CLIENT_HASH_SIZE]; static struct rb_root conf_name_tree; @@ -4509,11 +4499,11 @@ alloc_reclaim(void) } bool -nfs4_has_reclaimed_state(const char *name) +nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn) { struct nfs4_client_reclaim *crp; - crp = nfsd4_find_reclaim_client(name); + crp = nfsd4_find_reclaim_client(name, nn); return (crp && crp->cr_clp); } @@ -4521,7 +4511,7 @@ nfs4_has_reclaimed_state(const char *name) * failure => all reset bets are off, nfserr_no_grace... 
*/ struct nfs4_client_reclaim * -nfs4_client_to_reclaim(const char *name) +nfs4_client_to_reclaim(const char *name, struct nfsd_net *nn) { unsigned int strhashval; struct nfs4_client_reclaim *crp; @@ -4531,42 +4521,42 @@ nfs4_client_to_reclaim(const char *name) if (crp) { strhashval = clientstr_hashval(name); INIT_LIST_HEAD(&crp->cr_strhash); - list_add(&crp->cr_strhash, &reclaim_str_hashtbl[strhashval]); + list_add(&crp->cr_strhash, &nn->reclaim_str_hashtbl[strhashval]); memcpy(crp->cr_recdir, name, HEXDIR_LEN); crp->cr_clp = NULL; - reclaim_str_hashtbl_size++; + nn->reclaim_str_hashtbl_size++; } return crp; } void -nfs4_remove_reclaim_record(struct nfs4_client_reclaim *crp) +nfs4_remove_reclaim_record(struct nfs4_client_reclaim *crp, struct nfsd_net *nn) { list_del(&crp->cr_strhash); kfree(crp); - reclaim_str_hashtbl_size--; + nn->reclaim_str_hashtbl_size--; } void -nfs4_release_reclaim(void) +nfs4_release_reclaim(struct nfsd_net *nn) { struct nfs4_client_reclaim *crp = NULL; int i; for (i = 0; i < CLIENT_HASH_SIZE; i++) { - while (!list_empty(&reclaim_str_hashtbl[i])) { - crp = list_entry(reclaim_str_hashtbl[i].next, + while (!list_empty(&nn->reclaim_str_hashtbl[i])) { + crp = list_entry(nn->reclaim_str_hashtbl[i].next, struct nfs4_client_reclaim, cr_strhash); - nfs4_remove_reclaim_record(crp); + nfs4_remove_reclaim_record(crp, nn); } } - BUG_ON(reclaim_str_hashtbl_size); + BUG_ON(nn->reclaim_str_hashtbl_size); } /* * called from OPEN, CLAIM_PREVIOUS with a new clientid. */ struct nfs4_client_reclaim * -nfsd4_find_reclaim_client(const char *recdir) +nfsd4_find_reclaim_client(const char *recdir, struct nfsd_net *nn) { unsigned int strhashval; struct nfs4_client_reclaim *crp = NULL; @@ -4574,7 +4564,7 @@ nfsd4_find_reclaim_client(const char *recdir) dprintk("NFSD: nfs4_find_reclaim_client for recdir %s\n", recdir); strhashval = clientstr_hashval(recdir); - list_for_each_entry(crp, &reclaim_str_hashtbl[strhashval], cr_strhash) { + list_for_each_entry(crp, &nn->reclaim_str_hashtbl[strhashval], cr_strhash) { if (same_name(crp->cr_recdir, recdir)) { return crp; } @@ -4732,7 +4722,6 @@ nfs4_state_init(void) for (i = 0; i < CLIENT_HASH_SIZE; i++) { INIT_LIST_HEAD(&conf_id_hashtbl[i]); INIT_LIST_HEAD(&unconf_id_hashtbl[i]); - INIT_LIST_HEAD(&reclaim_str_hashtbl[i]); } conf_name_tree = RB_ROOT; unconf_name_tree = RB_ROOT; @@ -4749,7 +4738,6 @@ nfs4_state_init(void) INIT_LIST_HEAD(&close_lru); INIT_LIST_HEAD(&client_lru); INIT_LIST_HEAD(&del_recall_lru); - reclaim_str_hashtbl_size = 0; } /* diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index ca8ee8c3ae74..26a912cdfe0c 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -466,9 +466,10 @@ extern __be32 nfs4_preprocess_stateid_op(struct net *net, stateid_t *stateid, int flags, struct file **filp); extern void nfs4_lock_state(void); extern void nfs4_unlock_state(void); -void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *); -extern void nfs4_release_reclaim(void); -extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir); +void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *); +extern void nfs4_release_reclaim(struct nfsd_net *); +extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir, + struct nfsd_net *nn); extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions); extern void nfs4_free_openowner(struct nfs4_openowner *); extern void nfs4_free_lockowner(struct nfs4_lockowner *); @@ -482,8 +483,9 @@ extern int nfsd4_create_callback_queue(void); extern 
void nfsd4_destroy_callback_queue(void); extern void nfsd4_shutdown_callback(struct nfs4_client *); extern void nfs4_put_delegation(struct nfs4_delegation *dp); -extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name); -extern bool nfs4_has_reclaimed_state(const char *name); +extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name, + struct nfsd_net *nn); +extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn); extern void release_session_client(struct nfsd4_session *); extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *); -- cgit v1.2.1 From 8daae4dc0d09d44d38194f72bc91740b46a6ce53 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Wed, 14 Nov 2012 18:21:21 +0300 Subject: nfsd: make conf_id_hashtbl allocated per net This hash holds nfs4_clients info, which are network namespace aware. So let's make it allocated per network namespace. Note: this hash can be allocated in per-net operations. But it looks better to allocate it on nfsd state start and thus don't waste resources if server is not running. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. Bruce Fields --- fs/nfsd/netns.h | 1 + fs/nfsd/nfs4state.c | 75 ++++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 55 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 49e54790d862..0cc85e95e8a4 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -50,6 +50,7 @@ struct nfsd_net { */ struct list_head *reclaim_str_hashtbl; int reclaim_str_hashtbl_size; + struct list_head *conf_id_hashtbl; }; extern int nfsd_net_id; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ba4785559509..6df427773d01 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -418,7 +418,6 @@ static unsigned int clientstr_hashval(const char *name) * * All of the above fields are protected by the client_mutex. 
*/ -static struct list_head conf_id_hashtbl[CLIENT_HASH_SIZE]; static struct list_head unconf_id_hashtbl[CLIENT_HASH_SIZE]; static struct rb_root conf_name_tree; static struct rb_root unconf_name_tree; @@ -1385,9 +1384,10 @@ static void move_to_confirmed(struct nfs4_client *clp) { unsigned int idhashval = clientid_hashval(clp->cl_clientid.cl_id); + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp); - list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]); + list_move(&clp->cl_idhash, &nn->conf_id_hashtbl[idhashval]); rb_erase(&clp->cl_namenode, &unconf_name_tree); add_clp_to_name_tree(clp, &conf_name_tree); set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags); @@ -1395,12 +1395,12 @@ move_to_confirmed(struct nfs4_client *clp) } static struct nfs4_client * -find_confirmed_client(clientid_t *clid, bool sessions) +find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn) { struct nfs4_client *clp; unsigned int idhashval = clientid_hashval(clid->cl_id); - list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) { + list_for_each_entry(clp, &nn->conf_id_hashtbl[idhashval], cl_idhash) { if (same_clid(&clp->cl_clientid, clid)) { if ((bool)clp->cl_minorversion != sessions) return NULL; @@ -1787,6 +1787,7 @@ nfsd4_create_session(struct svc_rqst *rqstp, struct nfsd4_conn *conn; struct nfsd4_clid_slot *cs_slot = NULL; __be32 status = 0; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); if (cr_ses->flags & ~SESSION4_FLAG_MASK_A) return nfserr_inval; @@ -1802,7 +1803,7 @@ nfsd4_create_session(struct svc_rqst *rqstp, nfs4_lock_state(); unconf = find_unconfirmed_client(&cr_ses->clientid, true); - conf = find_confirmed_client(&cr_ses->clientid, true); + conf = find_confirmed_client(&cr_ses->clientid, true, nn); if (conf) { cs_slot = &conf->cl_cs_slot; @@ -2142,10 +2143,11 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta { struct nfs4_client *conf, *unconf, *clp; __be32 status = 0; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); nfs4_lock_state(); unconf = find_unconfirmed_client(&dc->clientid, true); - conf = find_confirmed_client(&dc->clientid, true); + conf = find_confirmed_client(&dc->clientid, true, nn); if (conf) { clp = conf; @@ -2280,7 +2282,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, return nfserr_stale_clientid; nfs4_lock_state(); - conf = find_confirmed_client(clid, false); + conf = find_confirmed_client(clid, false, nn); unconf = find_unconfirmed_client(clid, false); /* * We try hard to give out unique clientid's, so if we get an @@ -2656,7 +2658,8 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, oo = find_openstateowner_str(strhashval, open, cstate->minorversion); open->op_openowner = oo; if (!oo) { - clp = find_confirmed_client(clientid, cstate->minorversion); + clp = find_confirmed_client(clientid, cstate->minorversion, + nn); if (clp == NULL) return nfserr_expired; goto new_owner; @@ -3152,7 +3155,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfserr_stale_clientid; if (STALE_CLIENTID(clid, nn)) goto out; - clp = find_confirmed_client(clid, cstate->minorversion); + clp = find_confirmed_client(clid, cstate->minorversion, nn); status = nfserr_expired; if (clp == NULL) { /* We assume the client took too long to RENEW. 
*/ @@ -3428,7 +3431,7 @@ static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, s return nfserr_bad_stateid; if (STALE_STATEID(stateid, nn)) return nfserr_stale_stateid; - cl = find_confirmed_client(&stateid->si_opaque.so_clid, sessions); + cl = find_confirmed_client(&stateid->si_opaque.so_clid, sessions, nn); if (!cl) return nfserr_expired; *s = find_stateid_by_type(cl, stateid, typemask); @@ -4579,9 +4582,10 @@ __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions) { struct nfs4_client *clp; + struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); /* find clientid in conf_id_hashtbl */ - clp = find_confirmed_client(clid, sessions); + clp = find_confirmed_client(clid, sessions, nn); if (clp == NULL) return nfserr_reclaim_bad; @@ -4720,7 +4724,6 @@ nfs4_state_init(void) int i; for (i = 0; i < CLIENT_HASH_SIZE; i++) { - INIT_LIST_HEAD(&conf_id_hashtbl[i]); INIT_LIST_HEAD(&unconf_id_hashtbl[i]); } conf_name_tree = RB_ROOT; @@ -4761,6 +4764,38 @@ set_max_delegations(void) max_delegations = nr_free_buffer_pages() >> (20 - 2 - PAGE_SHIFT); } +static int nfs4_state_start_net(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + int i; + + nn->conf_id_hashtbl = kmalloc(sizeof(struct list_head) * + CLIENT_HASH_SIZE, GFP_KERNEL); + if (!nn->conf_id_hashtbl) + return -ENOMEM; + + for (i = 0; i < CLIENT_HASH_SIZE; i++) + INIT_LIST_HEAD(&nn->conf_id_hashtbl[i]); + + return 0; +} + +static void +__nfs4_state_shutdown_net(struct net *net) +{ + int i; + struct nfs4_client *clp = NULL; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + for (i = 0; i < CLIENT_HASH_SIZE; i++) { + while (!list_empty(&nn->conf_id_hashtbl[i])) { + clp = list_entry(nn->conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); + destroy_client(clp); + } + } + kfree(nn->conf_id_hashtbl); +} + /* initialization to perform when the nfsd service is started: */ int @@ -4778,6 +4813,9 @@ nfs4_state_start(void) * basis. */ get_net(net); + ret = nfs4_state_start_net(net); + if (ret) + return ret; nfsd4_client_tracking_init(net); nn->boot_time = get_seconds(); locks_start_grace(net, &nn->nfsd4_manager); @@ -4804,26 +4842,21 @@ out_free_laundry: destroy_workqueue(laundry_wq); out_recovery: nfsd4_client_tracking_exit(net); + __nfs4_state_shutdown_net(net); put_net(net); return ret; } /* should be called with the state lock held */ static void -__nfs4_state_shutdown(void) +__nfs4_state_shutdown(struct net *net) { - int i; struct nfs4_client *clp = NULL; struct nfs4_delegation *dp = NULL; struct list_head *pos, *next, reaplist; struct rb_node *node, *tmp; - for (i = 0; i < CLIENT_HASH_SIZE; i++) { - while (!list_empty(&conf_id_hashtbl[i])) { - clp = list_entry(conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); - destroy_client(clp); - } - } + __nfs4_state_shutdown_net(net); node = rb_first(&unconf_name_tree); while (node != NULL) { @@ -4860,7 +4893,7 @@ nfs4_state_shutdown(void) destroy_workqueue(laundry_wq); locks_end_grace(&nn->nfsd4_manager); nfs4_lock_state(); - __nfs4_state_shutdown(); + __nfs4_state_shutdown(net); nfs4_unlock_state(); nfsd4_destroy_callback_queue(); } -- cgit v1.2.1 From 382a62e76cbf91fb364a4cd8732761e4ecf62153 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Wed, 14 Nov 2012 18:21:26 +0300 Subject: nfsd: make conf_name_tree per net This tree holds nfs4_clients info, which are network namespace aware. So let's make it per network namespace. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. 
Bruce Fields --- fs/nfsd/netns.h | 4 ++++ fs/nfsd/nfs4state.c | 31 ++++++++++++++++--------------- 2 files changed, 20 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 0cc85e95e8a4..afd911638464 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -47,10 +47,14 @@ struct nfsd_net { /* * reclaim_str_hashtbl[] holds known client info from previous reset/reboot * used in reboot/reset lease grace period processing + * + * conf_id_hashtbl[], and conf_name_tree hold confirmed + * setclientid_confirmed info. */ struct list_head *reclaim_str_hashtbl; int reclaim_str_hashtbl_size; struct list_head *conf_id_hashtbl; + struct rb_root conf_name_tree; }; extern int nfsd_net_id; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 6df427773d01..d40e57b9051a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -404,9 +404,6 @@ static unsigned int clientstr_hashval(const char *name) } /* - * conf_id_hashtbl[], and conf_name_tree hold confirmed - * setclientid_confirmed info. - * * unconf_id_hashtbl[] and unconf_name_tree hold unconfirmed * setclientid info. * @@ -419,7 +416,6 @@ static unsigned int clientstr_hashval(const char *name) * All of the above fields are protected by the client_mutex. */ static struct list_head unconf_id_hashtbl[CLIENT_HASH_SIZE]; -static struct rb_root conf_name_tree; static struct rb_root unconf_name_tree; static struct list_head client_lru; static struct list_head close_lru; @@ -1114,6 +1110,7 @@ destroy_client(struct nfs4_client *clp) struct nfs4_openowner *oo; struct nfs4_delegation *dp; struct list_head reaplist; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); INIT_LIST_HEAD(&reaplist); spin_lock(&recall_lock); @@ -1136,7 +1133,7 @@ destroy_client(struct nfs4_client *clp) svc_xprt_put(clp->cl_cb_conn.cb_xprt); list_del(&clp->cl_idhash); if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags)) - rb_erase(&clp->cl_namenode, &conf_name_tree); + rb_erase(&clp->cl_namenode, &nn->conf_name_tree); else rb_erase(&clp->cl_namenode, &unconf_name_tree); spin_lock(&client_lock); @@ -1389,7 +1386,7 @@ move_to_confirmed(struct nfs4_client *clp) dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp); list_move(&clp->cl_idhash, &nn->conf_id_hashtbl[idhashval]); rb_erase(&clp->cl_namenode, &unconf_name_tree); - add_clp_to_name_tree(clp, &conf_name_tree); + add_clp_to_name_tree(clp, &nn->conf_name_tree); set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags); renew_client(clp); } @@ -1433,9 +1430,9 @@ static bool clp_used_exchangeid(struct nfs4_client *clp) } static struct nfs4_client * -find_confirmed_client_by_name(struct xdr_netobj *name) +find_confirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn) { - return find_clp_in_name_tree(name, &conf_name_tree); + return find_clp_in_name_tree(name, &nn->conf_name_tree); } static struct nfs4_client * @@ -1635,7 +1632,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, /* Cases below refer to rfc 5661 section 18.35.4: */ nfs4_lock_state(); - conf = find_confirmed_client_by_name(&exid->clname); + conf = find_confirmed_client_by_name(&exid->clname, nn); if (conf) { bool creds_match = same_creds(&conf->cl_cred, &rqstp->rq_cred); bool verfs_match = same_verf(&verf, &conf->cl_verifier); @@ -1829,7 +1826,7 @@ nfsd4_create_session(struct svc_rqst *rqstp, status = nfserr_seq_misordered; goto out_free_conn; } - old = find_confirmed_client_by_name(&unconf->cl_name); + old = find_confirmed_client_by_name(&unconf->cl_name, nn); if (old) expire_client(old); 
move_to_confirmed(unconf); @@ -2227,7 +2224,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* Cases below refer to rfc 3530 section 14.2.33: */ nfs4_lock_state(); - conf = find_confirmed_client_by_name(&clname); + conf = find_confirmed_client_by_name(&clname, nn); if (conf) { /* case 0: */ status = nfserr_clid_inuse; @@ -2309,7 +2306,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, nfsd4_probe_callback(conf); expire_client(unconf); } else { /* case 3: normal case; new or rebooted client */ - conf = find_confirmed_client_by_name(&unconf->cl_name); + conf = find_confirmed_client_by_name(&unconf->cl_name, nn); if (conf) expire_client(conf); move_to_confirmed(unconf); @@ -4726,7 +4723,6 @@ nfs4_state_init(void) for (i = 0; i < CLIENT_HASH_SIZE; i++) { INIT_LIST_HEAD(&unconf_id_hashtbl[i]); } - conf_name_tree = RB_ROOT; unconf_name_tree = RB_ROOT; for (i = 0; i < SESSION_HASH_SIZE; i++) INIT_LIST_HEAD(&sessionid_hashtbl[i]); @@ -4772,12 +4768,17 @@ static int nfs4_state_start_net(struct net *net) nn->conf_id_hashtbl = kmalloc(sizeof(struct list_head) * CLIENT_HASH_SIZE, GFP_KERNEL); if (!nn->conf_id_hashtbl) - return -ENOMEM; + goto err; - for (i = 0; i < CLIENT_HASH_SIZE; i++) + for (i = 0; i < CLIENT_HASH_SIZE; i++) { INIT_LIST_HEAD(&nn->conf_id_hashtbl[i]); + } + nn->conf_name_tree = RB_ROOT; return 0; + +err: + return -ENOMEM; } static void -- cgit v1.2.1 From 0a7ec37727dcc3293cd4c9958b25c43f3a797d47 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Wed, 14 Nov 2012 18:21:31 +0300 Subject: nfsd: make unconf_id_hashtbl allocated per net This hash holds nfs4_clients info, which are network namespace aware. So let's make it allocated per network namespace. Note: this hash can be allocated in per-net operations. But it looks better to allocate it on nfsd state start and thus don't waste resources if server is not running. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. Bruce Fields --- fs/nfsd/netns.h | 1 + fs/nfsd/nfs4state.c | 25 +++++++++++++++---------- 2 files changed, 16 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index afd911638464..1ff781f9c3d0 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -55,6 +55,7 @@ struct nfsd_net { int reclaim_str_hashtbl_size; struct list_head *conf_id_hashtbl; struct rb_root conf_name_tree; + struct list_head *unconf_id_hashtbl; }; extern int nfsd_net_id; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d40e57b9051a..f33bbfbdc24f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -415,7 +415,6 @@ static unsigned int clientstr_hashval(const char *name) * * All of the above fields are protected by the client_mutex. 
*/ -static struct list_head unconf_id_hashtbl[CLIENT_HASH_SIZE]; static struct rb_root unconf_name_tree; static struct list_head client_lru; static struct list_head close_lru; @@ -1369,11 +1368,12 @@ static void add_to_unconfirmed(struct nfs4_client *clp) { unsigned int idhashval; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); clear_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags); add_clp_to_name_tree(clp, &unconf_name_tree); idhashval = clientid_hashval(clp->cl_clientid.cl_id); - list_add(&clp->cl_idhash, &unconf_id_hashtbl[idhashval]); + list_add(&clp->cl_idhash, &nn->unconf_id_hashtbl[idhashval]); renew_client(clp); } @@ -1409,12 +1409,12 @@ find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn) } static struct nfs4_client * -find_unconfirmed_client(clientid_t *clid, bool sessions) +find_unconfirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn) { struct nfs4_client *clp; unsigned int idhashval = clientid_hashval(clid->cl_id); - list_for_each_entry(clp, &unconf_id_hashtbl[idhashval], cl_idhash) { + list_for_each_entry(clp, &nn->unconf_id_hashtbl[idhashval], cl_idhash) { if (same_clid(&clp->cl_clientid, clid)) { if ((bool)clp->cl_minorversion != sessions) return NULL; @@ -1799,7 +1799,7 @@ nfsd4_create_session(struct svc_rqst *rqstp, goto out_free_session; nfs4_lock_state(); - unconf = find_unconfirmed_client(&cr_ses->clientid, true); + unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn); conf = find_confirmed_client(&cr_ses->clientid, true, nn); if (conf) { @@ -2143,7 +2143,7 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); nfs4_lock_state(); - unconf = find_unconfirmed_client(&dc->clientid, true); + unconf = find_unconfirmed_client(&dc->clientid, true, nn); conf = find_confirmed_client(&dc->clientid, true, nn); if (conf) { @@ -2280,7 +2280,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, nfs4_lock_state(); conf = find_confirmed_client(clid, false, nn); - unconf = find_unconfirmed_client(clid, false); + unconf = find_unconfirmed_client(clid, false, nn); /* * We try hard to give out unique clientid's, so if we get an * attempt to confirm the same clientid with a different cred, @@ -4720,9 +4720,6 @@ nfs4_state_init(void) { int i; - for (i = 0; i < CLIENT_HASH_SIZE; i++) { - INIT_LIST_HEAD(&unconf_id_hashtbl[i]); - } unconf_name_tree = RB_ROOT; for (i = 0; i < SESSION_HASH_SIZE; i++) INIT_LIST_HEAD(&sessionid_hashtbl[i]); @@ -4769,14 +4766,21 @@ static int nfs4_state_start_net(struct net *net) CLIENT_HASH_SIZE, GFP_KERNEL); if (!nn->conf_id_hashtbl) goto err; + nn->unconf_id_hashtbl = kmalloc(sizeof(struct list_head) * + CLIENT_HASH_SIZE, GFP_KERNEL); + if (!nn->unconf_id_hashtbl) + goto err_unconf_id; for (i = 0; i < CLIENT_HASH_SIZE; i++) { INIT_LIST_HEAD(&nn->conf_id_hashtbl[i]); + INIT_LIST_HEAD(&nn->unconf_id_hashtbl[i]); } nn->conf_name_tree = RB_ROOT; return 0; +err_unconf_id: + kfree(nn->conf_id_hashtbl); err: return -ENOMEM; } @@ -4794,6 +4798,7 @@ __nfs4_state_shutdown_net(struct net *net) destroy_client(clp); } } + kfree(nn->unconf_id_hashtbl); kfree(nn->conf_id_hashtbl); } -- cgit v1.2.1 From a99454aa4ff1241a19dcb486fa302d3e8cc09e5b Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Wed, 14 Nov 2012 18:21:36 +0300 Subject: nfsd: make unconf_name_tree per net This hash holds nfs4_clients info, which are network namespace aware. So let's make it allocated per network namespace. 
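Every patch in this series repeats the same conversion: a file-scope global becomes a field of struct nfsd_net and is reached through net_generic(). As a rough sketch of that generic per-net pattern (the foo_* names here are invented for illustration; the real nfsd code instead defers its hash-table allocations to nfs4_state_start_net(), as these commit messages note, so that an idle server costs nothing):

#include <linux/rbtree.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

/* Illustrative per-net container; nfsd's real one is struct nfsd_net. */
struct foo_net {
	struct rb_root name_tree;	/* was a file-scope global */
};

static int foo_net_id;

static int foo_init_net(struct net *net)
{
	/* net_generic() returns this namespace's private instance. */
	struct foo_net *fn = net_generic(net, foo_net_id);

	fn->name_tree = RB_ROOT;
	return 0;
}

static struct pernet_operations foo_net_ops = {
	.init = foo_init_net,
	.id   = &foo_net_id,
	.size = sizeof(struct foo_net),	/* allocated by the pernet core */
};

/* Registered once at module init with register_pernet_subsys(&foo_net_ops). */

The lookup through net_generic() is identical whether the state is allocated by the pernet core, as above, or lazily at service start, as nfsd does.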
Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. Bruce Fields --- fs/nfsd/netns.h | 4 ++++ fs/nfsd/nfs4state.c | 42 +++++++++++++++++++----------------------- 2 files changed, 23 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 1ff781f9c3d0..1e76030e1d16 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -50,12 +50,16 @@ struct nfsd_net { * * conf_id_hashtbl[], and conf_name_tree hold confirmed * setclientid_confirmed info. + * + * unconf_str_hastbl[] and unconf_name_tree hold unconfirmed + * setclientid info. */ struct list_head *reclaim_str_hashtbl; int reclaim_str_hashtbl_size; struct list_head *conf_id_hashtbl; struct rb_root conf_name_tree; struct list_head *unconf_id_hashtbl; + struct rb_root unconf_name_tree; }; extern int nfsd_net_id; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index f33bbfbdc24f..b35329199e35 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -404,9 +404,6 @@ static unsigned int clientstr_hashval(const char *name) } /* - * unconf_id_hashtbl[] and unconf_name_tree hold unconfirmed - * setclientid info. - * * client_lru holds client queue ordered by nfs4_client.cl_time * for lease renewal. * @@ -415,7 +412,6 @@ static unsigned int clientstr_hashval(const char *name) * * All of the above fields are protected by the client_mutex. */ -static struct rb_root unconf_name_tree; static struct list_head client_lru; static struct list_head close_lru; @@ -1134,7 +1130,7 @@ destroy_client(struct nfs4_client *clp) if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags)) rb_erase(&clp->cl_namenode, &nn->conf_name_tree); else - rb_erase(&clp->cl_namenode, &unconf_name_tree); + rb_erase(&clp->cl_namenode, &nn->unconf_name_tree); spin_lock(&client_lock); unhash_client_locked(clp); if (atomic_read(&clp->cl_refcount) == 0) @@ -1371,7 +1367,7 @@ add_to_unconfirmed(struct nfs4_client *clp) struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); clear_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags); - add_clp_to_name_tree(clp, &unconf_name_tree); + add_clp_to_name_tree(clp, &nn->unconf_name_tree); idhashval = clientid_hashval(clp->cl_clientid.cl_id); list_add(&clp->cl_idhash, &nn->unconf_id_hashtbl[idhashval]); renew_client(clp); @@ -1385,7 +1381,7 @@ move_to_confirmed(struct nfs4_client *clp) dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp); list_move(&clp->cl_idhash, &nn->conf_id_hashtbl[idhashval]); - rb_erase(&clp->cl_namenode, &unconf_name_tree); + rb_erase(&clp->cl_namenode, &nn->unconf_name_tree); add_clp_to_name_tree(clp, &nn->conf_name_tree); set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags); renew_client(clp); @@ -1436,9 +1432,9 @@ find_confirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn) } static struct nfs4_client * -find_unconfirmed_client_by_name(struct xdr_netobj *name) +find_unconfirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn) { - return find_clp_in_name_tree(name, &unconf_name_tree); + return find_clp_in_name_tree(name, &nn->unconf_name_tree); } static void @@ -1677,7 +1673,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, goto out; } - unconf = find_unconfirmed_client_by_name(&exid->clname); + unconf = find_unconfirmed_client_by_name(&exid->clname, nn); if (unconf) /* case 4, possible retry or client restart */ expire_client(unconf); @@ -2239,7 +2235,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } } - unconf = find_unconfirmed_client_by_name(&clname); + unconf = find_unconfirmed_client_by_name(&clname, nn); 
if (unconf) expire_client(unconf); status = nfserr_jukebox; @@ -4720,7 +4716,6 @@ nfs4_state_init(void) { int i; - unconf_name_tree = RB_ROOT; for (i = 0; i < SESSION_HASH_SIZE; i++) INIT_LIST_HEAD(&sessionid_hashtbl[i]); for (i = 0; i < FILE_HASH_SIZE; i++) { @@ -4776,6 +4771,7 @@ static int nfs4_state_start_net(struct net *net) INIT_LIST_HEAD(&nn->unconf_id_hashtbl[i]); } nn->conf_name_tree = RB_ROOT; + nn->unconf_name_tree = RB_ROOT; return 0; @@ -4791,6 +4787,7 @@ __nfs4_state_shutdown_net(struct net *net) int i; struct nfs4_client *clp = NULL; struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct rb_node *node, *tmp; for (i = 0; i < CLIENT_HASH_SIZE; i++) { while (!list_empty(&nn->conf_id_hashtbl[i])) { @@ -4798,6 +4795,16 @@ __nfs4_state_shutdown_net(struct net *net) destroy_client(clp); } } + + node = rb_first(&nn->unconf_name_tree); + while (node != NULL) { + tmp = node; + node = rb_next(tmp); + clp = rb_entry(tmp, struct nfs4_client, cl_namenode); + rb_erase(tmp, &nn->unconf_name_tree); + destroy_client(clp); + } + kfree(nn->unconf_id_hashtbl); kfree(nn->conf_id_hashtbl); } @@ -4857,22 +4864,11 @@ out_recovery: static void __nfs4_state_shutdown(struct net *net) { - struct nfs4_client *clp = NULL; struct nfs4_delegation *dp = NULL; struct list_head *pos, *next, reaplist; - struct rb_node *node, *tmp; __nfs4_state_shutdown_net(net); - node = rb_first(&unconf_name_tree); - while (node != NULL) { - tmp = node; - node = rb_next(tmp); - clp = rb_entry(tmp, struct nfs4_client, cl_namenode); - rb_erase(tmp, &unconf_name_tree); - destroy_client(clp); - } - INIT_LIST_HEAD(&reaplist); spin_lock(&recall_lock); list_for_each_safe(pos, next, &del_recall_lru) { -- cgit v1.2.1 From 9b5311374057e5c87017ea3756e566047c9b61e7 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Wed, 14 Nov 2012 18:21:41 +0300 Subject: nfsd: make ownerstr_hashtbl allocated per net This hash holds open owner state and is closely associated with nfs4_clients info, which are network namespace aware. So let's make it allocated per network namespace too. Note: this hash can be allocated in per-net operations. But it looks better to allocate it on nfsd state start and thus don't waste resources if server is not running. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J.
Bruce Fields --- fs/nfsd/netns.h | 1 + fs/nfsd/nfs4state.c | 41 ++++++++++++++++++++++++++--------------- 2 files changed, 27 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 1e76030e1d16..46cca9494c7a 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -60,6 +60,7 @@ struct nfsd_net { struct rb_root conf_name_tree; struct list_head *unconf_id_hashtbl; struct rb_root unconf_name_tree; + struct list_head *ownerstr_hashtbl; }; extern int nfsd_net_id; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b35329199e35..f68514d8210e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -176,8 +176,6 @@ static unsigned int ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername) return ret & OWNER_HASH_MASK; } -static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; - /* hash table for nfs4_file */ #define FILE_HASH_BITS 8 #define FILE_HASH_SIZE (1 << FILE_HASH_BITS) @@ -2428,7 +2426,9 @@ static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval) { - list_add(&oo->oo_owner.so_strhash, &ownerstr_hashtbl[strhashval]); + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + list_add(&oo->oo_owner.so_strhash, &nn->ownerstr_hashtbl[strhashval]); list_add(&oo->oo_perclient, &clp->cl_openowners); } @@ -2486,13 +2486,14 @@ same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner, } static struct nfs4_openowner * -find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open, bool sessions) +find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open, + bool sessions, struct nfsd_net *nn) { struct nfs4_stateowner *so; struct nfs4_openowner *oo; struct nfs4_client *clp; - list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) { + list_for_each_entry(so, &nn->ownerstr_hashtbl[hashval], so_strhash) { if (!so->so_is_open_owner) continue; if (same_owner_str(so, &open->op_owner, &open->op_clientid)) { @@ -2648,7 +2649,7 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, return nfserr_jukebox; strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner); - oo = find_openstateowner_str(strhashval, open, cstate->minorversion); + oo = find_openstateowner_str(strhashval, open, cstate->minorversion, nn); open->op_openowner = oo; if (!oo) { clp = find_confirmed_client(clientid, cstate->minorversion, @@ -3976,8 +3977,9 @@ static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, s struct inode *inode = open_stp->st_file->fi_inode; unsigned int inohash = lockowner_ino_hashval(inode, clp->cl_clientid.cl_id, &lo->lo_owner.so_owner); + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); - list_add(&lo->lo_owner.so_strhash, &ownerstr_hashtbl[strhashval]); + list_add(&lo->lo_owner.so_strhash, &nn->ownerstr_hashtbl[strhashval]); list_add(&lo->lo_owner_ino_hash, &lockowner_ino_hashtbl[inohash]); list_add(&lo->lo_perstateid, &open_stp->st_lockowners); } @@ -4458,7 +4460,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, status = nfserr_locks_held; INIT_LIST_HEAD(&matches); - list_for_each_entry(sop, &ownerstr_hashtbl[hashval], so_strhash) { + list_for_each_entry(sop, &nn->ownerstr_hashtbl[hashval], so_strhash) { if (sop->so_is_open_owner) continue; if (!same_owner_str(sop, owner, clid)) @@ -4614,13 +4616,14 @@ static void release_openowner_sop(struct nfs4_stateowner *sop) } static int nfsd_release_n_owners(u64 num, bool is_open_owner, - void 
(*release_sop)(struct nfs4_stateowner *)) + void (*release_sop)(struct nfs4_stateowner *), + struct nfsd_net *nn) { int i, count = 0; struct nfs4_stateowner *sop, *next; for (i = 0; i < OWNER_HASH_SIZE; i++) { - list_for_each_entry_safe(sop, next, &ownerstr_hashtbl[i], so_strhash) { + list_for_each_entry_safe(sop, next, &nn->ownerstr_hashtbl[i], so_strhash) { if (sop->so_is_open_owner != is_open_owner) continue; release_sop(sop); @@ -4634,9 +4637,10 @@ static int nfsd_release_n_owners(u64 num, bool is_open_owner, void nfsd_forget_locks(u64 num) { int count; + struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); nfs4_lock_state(); - count = nfsd_release_n_owners(num, false, release_lockowner_sop); + count = nfsd_release_n_owners(num, false, release_lockowner_sop, nn); nfs4_unlock_state(); printk(KERN_INFO "NFSD: Forgot %d locks", count); @@ -4645,9 +4649,10 @@ void nfsd_forget_locks(u64 num) void nfsd_forget_openowners(u64 num) { int count; + struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); nfs4_lock_state(); - count = nfsd_release_n_owners(num, true, release_openowner_sop); + count = nfsd_release_n_owners(num, true, release_openowner_sop, nn); nfs4_unlock_state(); printk(KERN_INFO "NFSD: Forgot %d open owners", count); @@ -4721,9 +4726,6 @@ nfs4_state_init(void) for (i = 0; i < FILE_HASH_SIZE; i++) { INIT_LIST_HEAD(&file_hashtbl[i]); } - for (i = 0; i < OWNER_HASH_SIZE; i++) { - INIT_LIST_HEAD(&ownerstr_hashtbl[i]); - } for (i = 0; i < LOCKOWNER_INO_HASH_SIZE; i++) INIT_LIST_HEAD(&lockowner_ino_hashtbl[i]); INIT_LIST_HEAD(&close_lru); @@ -4765,16 +4767,24 @@ static int nfs4_state_start_net(struct net *net) CLIENT_HASH_SIZE, GFP_KERNEL); if (!nn->unconf_id_hashtbl) goto err_unconf_id; + nn->ownerstr_hashtbl = kmalloc(sizeof(struct list_head) * + OWNER_HASH_SIZE, GFP_KERNEL); + if (!nn->ownerstr_hashtbl) + goto err_ownerstr; for (i = 0; i < CLIENT_HASH_SIZE; i++) { INIT_LIST_HEAD(&nn->conf_id_hashtbl[i]); INIT_LIST_HEAD(&nn->unconf_id_hashtbl[i]); } + for (i = 0; i < OWNER_HASH_SIZE; i++) + INIT_LIST_HEAD(&nn->ownerstr_hashtbl[i]); nn->conf_name_tree = RB_ROOT; nn->unconf_name_tree = RB_ROOT; return 0; +err_ownerstr: + kfree(nn->unconf_id_hashtbl); err_unconf_id: kfree(nn->conf_id_hashtbl); err: return -ENOMEM; } @@ -4805,6 +4815,7 @@ __nfs4_state_shutdown_net(struct net *net) destroy_client(clp); } + kfree(nn->ownerstr_hashtbl); kfree(nn->unconf_id_hashtbl); kfree(nn->conf_id_hashtbl); } -- cgit v1.2.1 From 20e9e2bc98b907efe82621797c561f6169d63d96 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Wed, 14 Nov 2012 18:21:46 +0300 Subject: nfsd: make lockowner_ino_hashtbl allocated per net This hash holds file lock owners and is closely associated with nfs4_clients info, which are network namespace aware. So let's make it allocated per network namespace too. Note: this hash can be allocated in per-net operations. But it looks better to allocate it on nfsd state start and thus don't waste resources if server is not running. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J.
Bruce Fields --- fs/nfsd/netns.h | 4 ++++ fs/nfsd/nfs4state.c | 27 ++++++++++++++++----------- 2 files changed, 20 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 46cca9494c7a..2281f6df5573 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -29,6 +29,9 @@ #define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) #define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1) +#define LOCKOWNER_INO_HASH_BITS 8 +#define LOCKOWNER_INO_HASH_SIZE (1 << LOCKOWNER_INO_HASH_BITS) + struct cld_net; struct nfsd_net { @@ -61,6 +64,7 @@ struct nfsd_net { struct list_head *unconf_id_hashtbl; struct rb_root unconf_name_tree; struct list_head *ownerstr_hashtbl; + struct list_head *lockowner_ino_hashtbl; }; extern int nfsd_net_id; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index f68514d8210e..1e76d55a3e9a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3862,8 +3862,6 @@ out: #define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start)) -#define LOCKOWNER_INO_HASH_BITS 8 -#define LOCKOWNER_INO_HASH_SIZE (1 << LOCKOWNER_INO_HASH_BITS) #define LOCKOWNER_INO_HASH_MASK (LOCKOWNER_INO_HASH_SIZE - 1) static inline u64 @@ -3893,8 +3891,6 @@ static unsigned int lockowner_ino_hashval(struct inode *inode, u32 cl_id, struct & LOCKOWNER_INO_HASH_MASK; } -static struct list_head lockowner_ino_hashtbl[LOCKOWNER_INO_HASH_SIZE]; - /* * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that * we can't properly handle lock requests that go beyond the (2^63 - 1)-th @@ -3960,12 +3956,12 @@ static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, c static struct nfs4_lockowner * find_lockowner_str(struct inode *inode, clientid_t *clid, - struct xdr_netobj *owner) + struct xdr_netobj *owner, struct nfsd_net *nn) { unsigned int hashval = lockowner_ino_hashval(inode, clid->cl_id, owner); struct nfs4_lockowner *lo; - list_for_each_entry(lo, &lockowner_ino_hashtbl[hashval], lo_owner_ino_hash) { + list_for_each_entry(lo, &nn->lockowner_ino_hashtbl[hashval], lo_owner_ino_hash) { if (same_lockowner_ino(lo, inode, clid, owner)) return lo; } @@ -3980,7 +3976,7 @@ static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, s struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); list_add(&lo->lo_owner.so_strhash, &nn->ownerstr_hashtbl[strhashval]); - list_add(&lo->lo_owner_ino_hash, &lockowner_ino_hashtbl[inohash]); + list_add(&lo->lo_owner_ino_hash, &nn->lockowner_ino_hashtbl[inohash]); list_add(&lo->lo_perstateid, &open_stp->st_lockowners); } @@ -4054,8 +4050,10 @@ static __be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, s struct nfs4_client *cl = oo->oo_owner.so_client; struct nfs4_lockowner *lo; unsigned int strhashval; + struct nfsd_net *nn = net_generic(cl->net, nfsd_net_id); - lo = find_lockowner_str(fi->fi_inode, &cl->cl_clientid, &lock->v.new.owner); + lo = find_lockowner_str(fi->fi_inode, &cl->cl_clientid, + &lock->v.new.owner, nn); if (lo) { if (!cstate->minorversion) return nfserr_bad_seqid; @@ -4308,7 +4306,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner); + lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner, nn); if (lo) file_lock->fl_owner = (fl_owner_t)lo; file_lock->fl_pid = current->tgid; @@ -4726,8 +4724,6 @@ nfs4_state_init(void) for (i = 0; i < FILE_HASH_SIZE; i++) { INIT_LIST_HEAD(&file_hashtbl[i]); } - for (i = 0; i < LOCKOWNER_INO_HASH_SIZE; 
i++) - INIT_LIST_HEAD(&lockowner_ino_hashtbl[i]); INIT_LIST_HEAD(&close_lru); INIT_LIST_HEAD(&client_lru); INIT_LIST_HEAD(&del_recall_lru); @@ -4771,6 +4767,10 @@ static int nfs4_state_start_net(struct net *net) OWNER_HASH_SIZE, GFP_KERNEL); if (!nn->ownerstr_hashtbl) goto err_ownerstr; + nn->lockowner_ino_hashtbl = kmalloc(sizeof(struct list_head) * + LOCKOWNER_INO_HASH_SIZE, GFP_KERNEL); + if (!nn->lockowner_ino_hashtbl) + goto err_lockowner_ino; for (i = 0; i < CLIENT_HASH_SIZE; i++) { INIT_LIST_HEAD(&nn->conf_id_hashtbl[i]); @@ -4778,11 +4778,15 @@ static int nfs4_state_start_net(struct net *net) } for (i = 0; i < OWNER_HASH_SIZE; i++) INIT_LIST_HEAD(&nn->ownerstr_hashtbl[i]); + for (i = 0; i < LOCKOWNER_INO_HASH_SIZE; i++) + INIT_LIST_HEAD(&nn->lockowner_ino_hashtbl[i]); nn->conf_name_tree = RB_ROOT; nn->unconf_name_tree = RB_ROOT; return 0; +err_lockowner_ino: + kfree(nn->ownerstr_hashtbl); err_ownerstr: kfree(nn->unconf_id_hashtbl); err_unconf_id: @@ -4815,6 +4819,7 @@ __nfs4_state_shutdown_net(struct net *net) destroy_client(clp); } + kfree(nn->lockowner_ino_hashtbl); kfree(nn->ownerstr_hashtbl); kfree(nn->unconf_id_hashtbl); kfree(nn->conf_id_hashtbl); -- cgit v1.2.1 From 1872de0e8171904612ee85de218fa045bc473cad Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Wed, 14 Nov 2012 18:21:51 +0300 Subject: nfsd: make sessionid_hashtbl allocated per net This hash holds established sessions state and is closely associated with nfs4_clients info, which are network namespace aware. So let's make it allocated per network namespace too. Note: this hash can be allocated in per-net operations. But it looks better to allocate it on nfsd state start and thus don't waste resources if server is not running. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J.
Bruce Fields --- fs/nfsd/netns.h | 3 +++ fs/nfsd/nfs4state.c | 28 +++++++++++++++++----------- 2 files changed, 20 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 2281f6df5573..da33d3f804b0 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -32,6 +32,8 @@ #define LOCKOWNER_INO_HASH_BITS 8 #define LOCKOWNER_INO_HASH_SIZE (1 << LOCKOWNER_INO_HASH_BITS) +#define SESSION_HASH_SIZE 512 + struct cld_net; struct nfsd_net { @@ -65,6 +67,7 @@ struct nfsd_net { struct rb_root unconf_name_tree; struct list_head *ownerstr_hashtbl; struct list_head *lockowner_ino_hashtbl; + struct list_head *sessionid_hashtbl; }; extern int nfsd_net_id; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1e76d55a3e9a..248f217a00bc 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -632,9 +632,6 @@ static void release_openowner(struct nfs4_openowner *oo) nfs4_free_openowner(oo); } -#define SESSION_HASH_SIZE 512 -static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE]; - static inline int hash_sessionid(struct nfs4_sessionid *sessionid) { @@ -928,6 +925,7 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan) static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses) { int idx; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); new->se_client = clp; gen_sessionid(new); @@ -941,7 +939,7 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru kref_init(&new->se_ref); idx = hash_sessionid(&new->se_sessionid); spin_lock(&client_lock); - list_add(&new->se_hash, &sessionid_hashtbl[idx]); + list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]); spin_lock(&clp->cl_lock); list_add(&new->se_perclnt, &clp->cl_sessions); spin_unlock(&clp->cl_lock); @@ -963,15 +961,16 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru /* caller must hold client_lock */ static struct nfsd4_session * -find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid) +find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net) { struct nfsd4_session *elem; int idx; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); dump_sessionid(__func__, sessionid); idx = hash_sessionid(sessionid); /* Search in the appropriate list */ - list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) { + list_for_each_entry(elem, &nn->sessionid_hashtbl[idx], se_hash) { if (!memcmp(elem->se_sessionid.data, sessionid->data, NFS4_MAX_SESSIONID_LEN)) { return elem; @@ -1905,7 +1904,7 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp, if (!nfsd4_last_compound_op(rqstp)) return nfserr_not_only_op; spin_lock(&client_lock); - cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid); + cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid, SVC_NET(rqstp)); /* Sorta weird: we only need the refcnt'ing because new_conn acquires * client_lock iself: */ if (cstate->session) { @@ -1954,7 +1953,7 @@ nfsd4_destroy_session(struct svc_rqst *r, } dump_sessionid(__func__, &sessionid->sessionid); spin_lock(&client_lock); - ses = find_in_sessionid_hashtbl(&sessionid->sessionid); + ses = find_in_sessionid_hashtbl(&sessionid->sessionid, SVC_NET(r)); if (!ses) { spin_unlock(&client_lock); goto out; @@ -2050,7 +2049,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, spin_lock(&client_lock); status = nfserr_badsession; - session = find_in_sessionid_hashtbl(&seq->sessionid); + session = 
find_in_sessionid_hashtbl(&seq->sessionid, SVC_NET(rqstp)); if (!session) goto out; @@ -4719,8 +4718,6 @@ nfs4_state_init(void) { int i; - for (i = 0; i < SESSION_HASH_SIZE; i++) - INIT_LIST_HEAD(&sessionid_hashtbl[i]); for (i = 0; i < FILE_HASH_SIZE; i++) { INIT_LIST_HEAD(&file_hashtbl[i]); } @@ -4771,6 +4768,10 @@ static int nfs4_state_start_net(struct net *net) LOCKOWNER_INO_HASH_SIZE, GFP_KERNEL); if (!nn->lockowner_ino_hashtbl) goto err_lockowner_ino; + nn->sessionid_hashtbl = kmalloc(sizeof(struct list_head) * + SESSION_HASH_SIZE, GFP_KERNEL); + if (!nn->sessionid_hashtbl) + goto err_sessionid; for (i = 0; i < CLIENT_HASH_SIZE; i++) { INIT_LIST_HEAD(&nn->conf_id_hashtbl[i]); @@ -4780,11 +4781,15 @@ static int nfs4_state_start_net(struct net *net) INIT_LIST_HEAD(&nn->ownerstr_hashtbl[i]); for (i = 0; i < LOCKOWNER_INO_HASH_SIZE; i++) INIT_LIST_HEAD(&nn->lockowner_ino_hashtbl[i]); + for (i = 0; i < SESSION_HASH_SIZE; i++) + INIT_LIST_HEAD(&nn->sessionid_hashtbl[i]); nn->conf_name_tree = RB_ROOT; nn->unconf_name_tree = RB_ROOT; return 0; +err_sessionid: + kfree(nn->lockowner_ino_hashtbl); err_lockowner_ino: kfree(nn->ownerstr_hashtbl); err_ownerstr: @@ -4819,6 +4824,7 @@ __nfs4_state_shutdown_net(struct net *net) destroy_client(clp); } + kfree(nn->sessionid_hashtbl); kfree(nn->lockowner_ino_hashtbl); kfree(nn->ownerstr_hashtbl); kfree(nn->unconf_id_hashtbl); -- cgit v1.2.1 From 5ed58bb243484e01e82ffca8451907403168e262 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Wed, 14 Nov 2012 18:21:56 +0300 Subject: nfsd: make client_lru list per net This list holds nfs4 clients queue for lease renewal, which are network namespace aware. So let's make this list per network namespace too. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. Bruce Fields --- fs/nfsd/netns.h | 5 +++++ fs/nfsd/nfs4state.c | 16 ++++++++-------- 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index da33d3f804b0..9a98a0aeee68 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -68,6 +68,11 @@ struct nfsd_net { struct list_head *ownerstr_hashtbl; struct list_head *lockowner_ino_hashtbl; struct list_head *sessionid_hashtbl; + /* + * client_lru holds client queue ordered by nfs4_client.cl_time + * for lease renewal. + */ + struct list_head client_lru; }; extern int nfsd_net_id; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 248f217a00bc..9cf7e9bf3691 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -402,15 +402,11 @@ static unsigned int clientstr_hashval(const char *name) } /* - * client_lru holds client queue ordered by nfs4_client.cl_time - * for lease renewal. - * * close_lru holds (open) stateowner queue ordered by nfs4_stateowner.so_time * for last close replay. * * All of the above fields are protected by the client_mutex. 
*/ -static struct list_head client_lru; static struct list_head close_lru; /* @@ -995,6 +991,8 @@ unhash_session(struct nfsd4_session *ses) static inline void renew_client_locked(struct nfs4_client *clp) { + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + if (is_client_expired(clp)) { WARN_ON(1); printk("%s: client (clientid %08x/%08x) already expired\n", @@ -1007,7 +1005,7 @@ renew_client_locked(struct nfs4_client *clp) dprintk("renewing client (clientid %08x/%08x)\n", clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); - list_move_tail(&clp->cl_lru, &client_lru); + list_move_tail(&clp->cl_lru, &nn->client_lru); clp->cl_time = get_seconds(); } @@ -3196,6 +3194,7 @@ nfs4_laundromat(void) time_t cutoff = get_seconds() - nfsd4_lease; time_t t, clientid_val = nfsd4_lease; time_t u, test_val = nfsd4_lease; + struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); nfs4_lock_state(); @@ -3203,7 +3202,7 @@ nfs4_laundromat(void) nfsd4_end_grace(&init_net); INIT_LIST_HEAD(&reaplist); spin_lock(&client_lock); - list_for_each_safe(pos, next, &client_lru) { + list_for_each_safe(pos, next, &nn->client_lru) { clp = list_entry(pos, struct nfs4_client, cl_lru); if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) { t = clp->cl_time - cutoff; @@ -4590,9 +4589,10 @@ void nfsd_forget_clients(u64 num) { struct nfs4_client *clp, *next; int count = 0; + struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); nfs4_lock_state(); - list_for_each_entry_safe(clp, next, &client_lru, cl_lru) { + list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) { expire_client(clp); if (++count == num) break; @@ -4722,7 +4722,6 @@ nfs4_state_init(void) INIT_LIST_HEAD(&file_hashtbl[i]); } INIT_LIST_HEAD(&close_lru); - INIT_LIST_HEAD(&client_lru); INIT_LIST_HEAD(&del_recall_lru); } @@ -4785,6 +4784,7 @@ static int nfs4_state_start_net(struct net *net) INIT_LIST_HEAD(&nn->sessionid_hashtbl[i]); nn->conf_name_tree = RB_ROOT; nn->unconf_name_tree = RB_ROOT; + INIT_LIST_HEAD(&nn->client_lru); return 0; -- cgit v1.2.1 From 73758fed711b847d833b9b0db59137eaeed06485 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Wed, 14 Nov 2012 18:22:01 +0300 Subject: nfsd: make close_lru list per net This list holds nfs4 clients (open) stateowner queue for last close replay, which are network namespace aware. So let's make this list per network namespace too. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. Bruce Fields --- fs/nfsd/netns.h | 6 ++++++ fs/nfsd/nfs4state.c | 20 +++++++------------- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 9a98a0aeee68..a356ea3dc686 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -71,8 +71,14 @@ struct nfsd_net { /* * client_lru holds client queue ordered by nfs4_client.cl_time * for lease renewal. + * + * close_lru holds (open) stateowner queue ordered by nfs4_stateowner.so_time + * for last close replay. + * + * All of the above fields are protected by the client_mutex. */ struct list_head client_lru; + struct list_head close_lru; }; extern int nfsd_net_id; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9cf7e9bf3691..a8e406449ef6 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -401,14 +401,6 @@ static unsigned int clientstr_hashval(const char *name) return opaque_hashval(name, 8) & CLIENT_HASH_MASK; } -/* - * close_lru holds (open) stateowner queue ordered by nfs4_stateowner.so_time - * for last close replay. 
- * - * All of the above fields are protected by the client_mutex. - */ -static struct list_head close_lru; - /* * We store the NONE, READ, WRITE, and BOTH bits separately in the * st_{access,deny}_bmap field of the stateid, in order to track not @@ -2465,11 +2457,13 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, } static void -move_to_close_lru(struct nfs4_openowner *oo) +move_to_close_lru(struct nfs4_openowner *oo, struct net *net) { + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo); - list_move_tail(&oo->oo_close_lru, &close_lru); + list_move_tail(&oo->oo_close_lru, &nn->close_lru); oo->oo_time = get_seconds(); } @@ -3242,7 +3236,7 @@ nfs4_laundromat(void) unhash_delegation(dp); } test_val = nfsd4_lease; - list_for_each_safe(pos, next, &close_lru) { + list_for_each_safe(pos, next, &nn->close_lru) { oo = container_of(pos, struct nfs4_openowner, oo_close_lru); if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) { u = oo->oo_time - cutoff; @@ -3820,7 +3814,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * little while to handle CLOSE replay. */ if (list_empty(&oo->oo_owner.so_stateids)) - move_to_close_lru(oo); + move_to_close_lru(oo, SVC_NET(rqstp)); } } out: @@ -4721,7 +4715,6 @@ nfs4_state_init(void) for (i = 0; i < FILE_HASH_SIZE; i++) { INIT_LIST_HEAD(&file_hashtbl[i]); } - INIT_LIST_HEAD(&close_lru); INIT_LIST_HEAD(&del_recall_lru); } @@ -4785,6 +4778,7 @@ static int nfs4_state_start_net(struct net *net) nn->conf_name_tree = RB_ROOT; nn->unconf_name_tree = RB_ROOT; INIT_LIST_HEAD(&nn->client_lru); + INIT_LIST_HEAD(&nn->close_lru); return 0; -- cgit v1.2.1 From 3320fef19b542b8df9606bd8e63990dc2a3fb330 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Wed, 14 Nov 2012 18:22:07 +0300 Subject: nfsd: use service net instead of hard-coded init_net This patch replaces init_net by SVC_NET(), where possible and also passes proper context to nested functions where required. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 13 +++++++---- fs/nfsd/nfs4state.c | 63 ++++++++++++++++++++++++++++++++--------------------- fs/nfsd/state.h | 2 +- fs/nfsd/xdr4.h | 2 +- 4 files changed, 49 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index f955176f1b6f..1d2396b79574 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -40,6 +40,7 @@ #include "xdr4.h" #include "vfs.h" #include "current_stateid.h" +#include "netns.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -304,6 +305,8 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { __be32 status; struct nfsd4_compoundres *resp; + struct net *net = SVC_NET(rqstp); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n", (int)open->op_fname.len, open->op_fname.data, @@ -331,7 +334,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* check seqid for replay. set nfs4_owner */ resp = rqstp->rq_resp; - status = nfsd4_process_open1(&resp->cstate, open); + status = nfsd4_process_open1(&resp->cstate, open, nn); if (status == nfserr_replay_me) { struct nfs4_replay *rp = &open->op_openowner->oo_owner.so_replay; fh_put(&cstate->current_fh); @@ -354,10 +357,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* Openowner is now set, so sequence id will get bumped. 
Now we need * these checks before we do any creates: */ status = nfserr_grace; - if (locks_in_grace(SVC_NET(rqstp)) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) + if (locks_in_grace(net) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) goto out; status = nfserr_no_grace; - if (!locks_in_grace(SVC_NET(rqstp)) && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) + if (!locks_in_grace(net) && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) goto out; switch (open->op_claim_type) { @@ -370,7 +373,9 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, break; case NFS4_OPEN_CLAIM_PREVIOUS: open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; - status = nfs4_check_open_reclaim(&open->op_clientid, cstate->minorversion); + status = nfs4_check_open_reclaim(&open->op_clientid, + cstate->minorversion, + nn); if (status) goto out; case NFS4_OPEN_CLAIM_FH: diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a8e406449ef6..996a8a58944d 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2620,14 +2620,13 @@ static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4 __be32 nfsd4_process_open1(struct nfsd4_compound_state *cstate, - struct nfsd4_open *open) + struct nfsd4_open *open, struct nfsd_net *nn) { clientid_t *clientid = &open->op_clientid; struct nfs4_client *clp = NULL; unsigned int strhashval; struct nfs4_openowner *oo = NULL; __be32 status; - struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); if (STALE_CLIENTID(&open->op_clientid, nn)) return nfserr_stale_clientid; @@ -3408,10 +3407,11 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) return nfs_ok; } -static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s, bool sessions) +static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, + struct nfs4_stid **s, bool sessions, + struct nfsd_net *nn) { struct nfs4_client *cl; - struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) return nfserr_bad_stateid; @@ -3439,6 +3439,7 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, struct nfs4_delegation *dp = NULL; struct svc_fh *current_fh = &cstate->current_fh; struct inode *ino = current_fh->fh_dentry->d_inode; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); __be32 status; if (filpp) @@ -3450,7 +3451,8 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) return check_special_stateids(net, current_fh, stateid, flags); - status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, &s, cstate->minorversion); + status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, + &s, cstate->minorversion, nn); if (status) return status; status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate)); @@ -3591,7 +3593,8 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ static __be32 nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, char typemask, - struct nfs4_ol_stateid **stpp) + struct nfs4_ol_stateid **stpp, + struct nfsd_net *nn) { __be32 status; struct nfs4_stid *s; @@ -3600,7 +3603,8 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, seqid, STATEID_VAL(stateid)); *stpp = NULL; - status = nfsd4_lookup_stateid(stateid, typemask, &s, 
cstate->minorversion); + status = nfsd4_lookup_stateid(stateid, typemask, &s, + cstate->minorversion, nn); if (status) return status; *stpp = openlockstateid(s); @@ -3609,13 +3613,14 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp); } -static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, struct nfs4_ol_stateid **stpp) +static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, + stateid_t *stateid, struct nfs4_ol_stateid **stpp, struct nfsd_net *nn) { __be32 status; struct nfs4_openowner *oo; status = nfs4_preprocess_seqid_op(cstate, seqid, stateid, - NFS4_OPEN_STID, stpp); + NFS4_OPEN_STID, stpp, nn); if (status) return status; oo = openowner((*stpp)->st_stateowner); @@ -3631,6 +3636,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __be32 status; struct nfs4_openowner *oo; struct nfs4_ol_stateid *stp; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); dprintk("NFSD: nfsd4_open_confirm on file %.*s\n", (int)cstate->current_fh.fh_dentry->d_name.len, @@ -3644,7 +3650,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_seqid_op(cstate, oc->oc_seqid, &oc->oc_req_stateid, - NFS4_OPEN_STID, &stp); + NFS4_OPEN_STID, &stp, nn); if (status) goto out; oo = openowner(stp->st_stateowner); @@ -3708,6 +3714,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, { __be32 status; struct nfs4_ol_stateid *stp; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); dprintk("NFSD: nfsd4_open_downgrade on file %.*s\n", (int)cstate->current_fh.fh_dentry->d_name.len, @@ -3720,7 +3727,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, nfs4_lock_state(); status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid, - &od->od_stateid, &stp); + &od->od_stateid, &stp, nn); if (status) goto out; status = nfserr_inval; @@ -3783,6 +3790,8 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __be32 status; struct nfs4_openowner *oo; struct nfs4_ol_stateid *stp; + struct net *net = SVC_NET(rqstp); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); dprintk("NFSD: nfsd4_close on file %.*s\n", (int)cstate->current_fh.fh_dentry->d_name.len, @@ -3792,7 +3801,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid, &close->cl_stateid, NFS4_OPEN_STID|NFS4_CLOSED_STID, - &stp); + &stp, nn); if (status) goto out; oo = openowner(stp->st_stateowner); @@ -3831,12 +3840,14 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stateid_t *stateid = &dr->dr_stateid; struct nfs4_stid *s; __be32 status; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) return status; nfs4_lock_state(); - status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s, cstate->minorversion); + status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s, + cstate->minorversion, nn); if (status) goto out; dp = delegstateid(s); @@ -4085,7 +4096,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, bool new_state = false; int lkflg; int err; - struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct net *net = SVC_NET(rqstp); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); dprintk("NFSD: nfsd4_lock: start=%Ld 
length=%Ld\n", (long long) lock->lk_offset, @@ -4119,7 +4131,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_confirmed_seqid_op(cstate, lock->lk_new_open_seqid, &lock->lk_new_open_stateid, - &open_stp); + &open_stp, nn); if (status) goto out; open_sop = openowner(open_stp->st_stateowner); @@ -4133,7 +4145,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_seqid_op(cstate, lock->lk_old_lock_seqid, &lock->lk_old_lock_stateid, - NFS4_LOCK_STID, &lock_stp); + NFS4_LOCK_STID, &lock_stp, nn); if (status) goto out; lock_sop = lockowner(lock_stp->st_stateowner); @@ -4144,10 +4156,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; status = nfserr_grace; - if (locks_in_grace(SVC_NET(rqstp)) && !lock->lk_reclaim) + if (locks_in_grace(net) && !lock->lk_reclaim) goto out; status = nfserr_no_grace; - if (!locks_in_grace(SVC_NET(rqstp)) && lock->lk_reclaim) + if (!locks_in_grace(net) && lock->lk_reclaim) goto out; file_lock = locks_alloc_lock(); @@ -4333,7 +4345,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct file_lock *file_lock = NULL; __be32 status; int err; - + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + dprintk("NFSD: nfsd4_locku: start=%Ld length=%Ld\n", (long long) locku->lu_offset, (long long) locku->lu_length); @@ -4344,7 +4357,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid, - &locku->lu_stateid, NFS4_LOCK_STID, &stp); + &locku->lu_stateid, NFS4_LOCK_STID, + &stp, nn); if (status) goto out; filp = find_any_file(stp->st_file); @@ -4564,10 +4578,9 @@ nfsd4_find_reclaim_client(const char *recdir, struct nfsd_net *nn) * Called from OPEN. Look for clientid in reclaim list. 
*/ __be32 -nfs4_check_open_reclaim(clientid_t *clid, bool sessions) +nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn) { struct nfs4_client *clp; - struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); /* find clientid in conf_id_hashtbl */ clp = find_confirmed_client(clid, sessions, nn); @@ -4583,7 +4596,7 @@ void nfsd_forget_clients(u64 num) { struct nfs4_client *clp, *next; int count = 0; - struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, nfsd_net_id); nfs4_lock_state(); list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) { @@ -4897,8 +4910,8 @@ __nfs4_state_shutdown(struct net *net) unhash_delegation(dp); } - nfsd4_client_tracking_exit(&init_net); - put_net(&init_net); + nfsd4_client_tracking_exit(net); + put_net(net); } void diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 26a912cdfe0c..bfe0106333cc 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -470,7 +470,7 @@ void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *) extern void nfs4_release_reclaim(struct nfsd_net *); extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir, struct nfsd_net *nn); -extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions); +extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn); extern void nfs4_free_openowner(struct nfs4_openowner *); extern void nfs4_free_lockowner(struct nfs4_lockowner *); extern int set_callback_cred(void); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 71c5c47f2750..3c414c1be295 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -581,7 +581,7 @@ extern __be32 nfsd4_destroy_session(struct svc_rqst *, extern __be32 nfsd4_destroy_clientid(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_destroy_clientid *); __be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *); extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, - struct nfsd4_open *open); + struct nfsd4_open *open, struct nfsd_net *nn); extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open); extern void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status); -- cgit v1.2.1 From 12760c6685624d65f8de078485c21b6a08e83409 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Wed, 14 Nov 2012 18:22:12 +0300 Subject: nfsd: pass nfsd_net instead of net to grace enders Passing the net context looks like overkill. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J.
Bruce Fields --- fs/nfsd/nfs4recover.c | 14 ++++++-------- fs/nfsd/nfs4state.c | 8 +++----- fs/nfsd/state.h | 2 +- 3 files changed, 10 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 376692ab1b3b..b657b622bf5d 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -58,7 +58,7 @@ struct nfsd4_client_tracking_ops { void (*create)(struct nfs4_client *); void (*remove)(struct nfs4_client *); int (*check)(struct nfs4_client *); - void (*grace_done)(struct net *, time_t); + void (*grace_done)(struct nfsd_net *, time_t); }; /* Globals */ @@ -391,10 +391,9 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) } static void -nfsd4_recdir_purge_old(struct net *net, time_t boot_time) +nfsd4_recdir_purge_old(struct nfsd_net *nn, time_t boot_time) { int status; - struct nfsd_net *nn = net_generic(net, nfsd_net_id); in_grace = false; if (!rec_file) @@ -1017,11 +1016,10 @@ nfsd4_cld_check(struct nfs4_client *clp) } static void -nfsd4_cld_grace_done(struct net *net, time_t boot_time) +nfsd4_cld_grace_done(struct nfsd_net *nn, time_t boot_time) { int ret; struct cld_upcall *cup; - struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct cld_net *cn = nn->cld_net; cup = alloc_cld_upcall(cn); @@ -1241,7 +1239,7 @@ nfsd4_umh_cltrack_check(struct nfs4_client *clp) } static void -nfsd4_umh_cltrack_grace_done(struct net __attribute__((unused)) *net, +nfsd4_umh_cltrack_grace_done(struct nfsd_net __attribute__((unused)) *nn, time_t boot_time) { char *legacy; @@ -1343,10 +1341,10 @@ nfsd4_client_record_check(struct nfs4_client *clp) } void -nfsd4_record_grace_done(struct net *net, time_t boot_time) +nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time) { if (client_tracking_ops) - client_tracking_ops->grace_done(net, boot_time); + client_tracking_ops->grace_done(nn, boot_time); } static int diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 996a8a58944d..2e4ed691255a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3157,17 +3157,15 @@ out: } static void -nfsd4_end_grace(struct net *net) +nfsd4_end_grace(struct nfsd_net *nn) { - struct nfsd_net *nn = net_generic(net, nfsd_net_id); - /* do nothing if grace period already ended */ if (nn->grace_ended) return; dprintk("NFSD: end of grace period\n"); nn->grace_ended = true; - nfsd4_record_grace_done(net, nn->boot_time); + nfsd4_record_grace_done(nn, nn->boot_time); locks_end_grace(&nn->nfsd4_manager); /* * Now that every NFSv4 client has had the chance to recover and @@ -3192,7 +3190,7 @@ nfs4_laundromat(void) nfs4_lock_state(); dprintk("NFSD: laundromat service - starting\n"); - nfsd4_end_grace(&init_net); + nfsd4_end_grace(nn); INIT_LIST_HEAD(&reaplist); spin_lock(&client_lock); list_for_each_safe(pos, next, &nn->client_lru) { diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index bfe0106333cc..2deb6a88e58e 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -495,5 +495,5 @@ extern void nfsd4_client_tracking_exit(struct net *net); extern void nfsd4_client_record_create(struct nfs4_client *clp); extern void nfsd4_client_record_remove(struct nfs4_client *clp); extern int nfsd4_client_record_check(struct nfs4_client *clp); -extern void nfsd4_record_grace_done(struct net *net, time_t boot_time); +extern void nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time); #endif /* NFSD4_STATE_H */ -- cgit v1.2.1 From 0912128149e86b48ed946371298d7fe61120d627 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Wed, 14 Nov 2012 
18:22:17 +0300 Subject: nfsd: make laundromat network namespace aware This patch moves laundromat_work to nfsd per-net context, thus allowing multiple laundromat instances to run. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. Bruce Fields --- fs/nfsd/netns.h | 2 ++ fs/nfsd/nfs4state.c | 21 +++++++++++++-------- 2 files changed, 15 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index a356ea3dc686..227b93ebb622 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -79,6 +79,8 @@ struct nfsd_net { */ struct list_head client_lru; struct list_head close_lru; + + struct delayed_work laundromat_work; }; extern int nfsd_net_id; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 2e4ed691255a..e75872f81e1c 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3176,7 +3176,7 @@ nfsd4_end_grace(struct nfsd_net *nn) } static time_t -nfs4_laundromat(void) +nfs4_laundromat(struct nfsd_net *nn) { struct nfs4_client *clp; struct nfs4_openowner *oo; @@ -3185,7 +3185,6 @@ nfs4_laundromat(void) time_t cutoff = get_seconds() - nfsd4_lease; time_t t, clientid_val = nfsd4_lease; time_t u, test_val = nfsd4_lease; - struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); nfs4_lock_state(); @@ -3251,16 +3250,19 @@ nfs4_laundromat(void) static struct workqueue_struct *laundry_wq; static void laundromat_main(struct work_struct *); -static DECLARE_DELAYED_WORK(laundromat_work, laundromat_main); static void -laundromat_main(struct work_struct *not_used) +laundromat_main(struct work_struct *laundry) { time_t t; + struct delayed_work *dwork = container_of(laundry, struct delayed_work, + work); + struct nfsd_net *nn = container_of(dwork, struct nfsd_net, + laundromat_work); - t = nfs4_laundromat(); + t = nfs4_laundromat(nn); dprintk("NFSD: laundromat_main - sleeping for %ld seconds\n", t); - queue_delayed_work(laundry_wq, &laundromat_work, t*HZ); + queue_delayed_work(laundry_wq, &nn->laundromat_work, t*HZ); } static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) @@ -4791,6 +4793,8 @@ static int nfs4_state_start_net(struct net *net) INIT_LIST_HEAD(&nn->client_lru); INIT_LIST_HEAD(&nn->close_lru); + INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main); + return 0; err_sessionid: @@ -4875,7 +4879,8 @@ nfs4_state_start(void) ret = nfsd4_create_callback_queue(); if (ret) goto out_free_laundry; - queue_delayed_work(laundry_wq, &laundromat_work, nfsd4_grace * HZ); + + queue_delayed_work(laundry_wq, &nn->laundromat_work, nfsd4_grace * HZ); set_max_delegations(); return 0; out_free_laundry: @@ -4918,7 +4923,7 @@ nfs4_state_shutdown(void) struct net *net = &init_net; struct nfsd_net *nn = net_generic(net, nfsd_net_id); - cancel_delayed_work_sync(&laundromat_work); + cancel_delayed_work_sync(&nn->laundromat_work); destroy_workqueue(laundry_wq); locks_end_grace(&nn->nfsd4_manager); nfs4_lock_state(); -- cgit v1.2.1 From 7dd2517c39c1334c9431c0732487e16f752ca09a Mon Sep 17 00:00:00 2001 From: Yan Hong Date: Thu, 8 Nov 2012 16:10:17 -0800 Subject: fs/debugfs: remove unnecessary inode->i_private initialization inode->i_private is promised to be NULL on allocation, so there is no need to set it explicitly.
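In other words, the deleted assignment below is dead code. A reduced sketch of the idea, assuming only the allocation contract stated above (this is not the actual debugfs code, which also sets mode, ops, and timestamps):

static struct inode *example_get_inode(struct super_block *sb)
{
	struct inode *inode = new_inode(sb);

	/*
	 * No "inode->i_private = NULL" needed here: per the commit
	 * message, a freshly allocated inode already arrives with
	 * i_private cleared.
	 */
	return inode;
}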
Signed-off-by: Yan Hong Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index b607d92cdf24..153bb1e42e63 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -59,7 +59,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb, umode_t mode, dev case S_IFDIR: inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; - inode->i_private = NULL; /* directory inodes start off with i_nlink == 2 * (for "." entry) */ -- cgit v1.2.1 From c3f8fc73ac97b76a12692088ef9cace9af8422c0 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:01 +1100 Subject: xfs: make buffer read verification an IO completion function Add a verifier function callback capability to the buffer read interfaces. This will be used by the callers to supply a function that verifies the contents of the buffer when it is read from disk. This patch does not provide callback functions, but simply modifies the interfaces to allow them to be called. The reason for adding this to the read interfaces is that it is very difficult to tell from the outside if a buffer was just read from disk or whether we just pulled it out of cache. Supplying a callback allows the buffer cache to use its internal knowledge of the buffer to execute it only when the buffer is read from disk. It is intended that the verifier functions will mark the buffer with an EFSCORRUPTED error when verification fails. This allows the reading context to distinguish a verification error from an IO error, and potentially take further actions on the buffer (e.g. attempt repair) based on the error reported. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 4 ++-- fs/xfs/xfs_attr.c | 2 +- fs/xfs/xfs_btree.c | 21 ++++++++++++--------- fs/xfs/xfs_buf.c | 13 +++++++++---- fs/xfs/xfs_buf.h | 20 ++++++++++++-------- fs/xfs/xfs_da_btree.c | 4 ++-- fs/xfs/xfs_dir2_leaf.c | 2 +- fs/xfs/xfs_dquot.c | 4 ++-- fs/xfs/xfs_fsops.c | 4 ++-- fs/xfs/xfs_ialloc.c | 2 +- fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_log.c | 3 +-- fs/xfs/xfs_log_recover.c | 8 +++++--- fs/xfs/xfs_mount.c | 6 +++--- fs/xfs/xfs_qm.c | 5 +++-- fs/xfs/xfs_rtalloc.c | 6 +++--- fs/xfs/xfs_trans.h | 19 ++++++++----------- fs/xfs/xfs_trans_buf.c | 9 ++++++--- fs/xfs/xfs_vnodeops.c | 2 +- 19 files changed, 75 insertions(+), 61 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 335206a9c698..21c3db08fd01 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -447,7 +447,7 @@ xfs_alloc_read_agfl( error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp); + XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); if (error) return error; ASSERT(!xfs_buf_geterror(bp)); @@ -2110,7 +2110,7 @@ xfs_read_agf( error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), flags, bpp); + XFS_FSS_TO_BB(mp, 1), flags, bpp, NULL); if (error) return error; if (!*bpp) diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 55bbe98e8f82..474c57a43cce 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -1994,7 +1994,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); error = xfs_trans_read_buf(mp, NULL,
mp->m_ddev_targp, - dblkno, blkcnt, 0, &bp); + dblkno, blkcnt, 0, &bp, NULL); if (error) return(error); diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 121ea99e615a..7e791160092d 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -266,9 +266,12 @@ xfs_btree_dup_cursor( for (i = 0; i < new->bc_nlevels; i++) { new->bc_ptrs[i] = cur->bc_ptrs[i]; new->bc_ra[i] = cur->bc_ra[i]; - if ((bp = cur->bc_bufs[i])) { - if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_BUF_ADDR(bp), mp->m_bsize, 0, &bp))) { + bp = cur->bc_bufs[i]; + if (bp) { + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_BUF_ADDR(bp), mp->m_bsize, + 0, &bp, NULL); + if (error) { xfs_btree_del_cursor(new, error); *ncur = NULL; return error; @@ -624,10 +627,10 @@ xfs_btree_read_bufl( ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); - if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, - mp->m_bsize, lock, &bp))) { + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, + mp->m_bsize, lock, &bp, NULL); + if (error) return error; - } ASSERT(!xfs_buf_geterror(bp)); if (bp) xfs_buf_set_ref(bp, refval); @@ -650,7 +653,7 @@ xfs_btree_reada_bufl( ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); - xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, NULL); } /* @@ -670,7 +673,7 @@ xfs_btree_reada_bufs( ASSERT(agno != NULLAGNUMBER); ASSERT(agbno != NULLAGBLOCK); d = XFS_AGB_TO_DADDR(mp, agno, agbno); - xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, NULL); } STATIC int @@ -1013,7 +1016,7 @@ xfs_btree_read_buf_block( d = xfs_btree_ptr_to_daddr(cur, ptr); error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, - mp->m_bsize, flags, bpp); + mp->m_bsize, flags, bpp, NULL); if (error) return error; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 4b0b8dd1b7b0..0298dd684798 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -654,7 +654,8 @@ xfs_buf_read_map( struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags) + xfs_buf_flags_t flags, + xfs_buf_iodone_t verify) { struct xfs_buf *bp; @@ -666,6 +667,7 @@ xfs_buf_read_map( if (!XFS_BUF_ISDONE(bp)) { XFS_STATS_INC(xb_get_read); + bp->b_iodone = verify; _xfs_buf_read(bp, flags); } else if (flags & XBF_ASYNC) { /* @@ -691,13 +693,14 @@ void xfs_buf_readahead_map( struct xfs_buftarg *target, struct xfs_buf_map *map, - int nmaps) + int nmaps, + xfs_buf_iodone_t verify) { if (bdi_read_congested(target->bt_bdi)) return; xfs_buf_read_map(target, map, nmaps, - XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); + XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, verify); } /* @@ -709,7 +712,8 @@ xfs_buf_read_uncached( struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, - int flags) + int flags, + xfs_buf_iodone_t verify) { xfs_buf_t *bp; int error; @@ -723,6 +727,7 @@ xfs_buf_read_uncached( bp->b_bn = daddr; bp->b_maps[0].bm_bn = daddr; bp->b_flags |= XBF_READ; + bp->b_iodone = verify; xfsbdstrat(target->bt_mount, bp); error = xfs_buf_iowait(bp); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 7c0b6a0a1557..677b1dc822f4 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -100,6 +100,7 @@ typedef struct xfs_buftarg { struct xfs_buf; typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); + #define XB_PAGES 2 struct xfs_buf_map { @@ -159,7 +160,6 @@ typedef struct xfs_buf { #endif } xfs_buf_t; - /* Finding and Reading Buffers */ struct 
xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, @@ -196,9 +196,10 @@ struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target, xfs_buf_flags_t flags); struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags); + xfs_buf_flags_t flags, xfs_buf_iodone_t verify); void xfs_buf_readahead_map(struct xfs_buftarg *target, - struct xfs_buf_map *map, int nmaps); + struct xfs_buf_map *map, int nmaps, + xfs_buf_iodone_t verify); static inline struct xfs_buf * xfs_buf_get( @@ -216,20 +217,22 @@ xfs_buf_read( struct xfs_buftarg *target, xfs_daddr_t blkno, size_t numblks, - xfs_buf_flags_t flags) + xfs_buf_flags_t flags, + xfs_buf_iodone_t verify) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - return xfs_buf_read_map(target, &map, 1, flags); + return xfs_buf_read_map(target, &map, 1, flags, verify); } static inline void xfs_buf_readahead( struct xfs_buftarg *target, xfs_daddr_t blkno, - size_t numblks) + size_t numblks, + xfs_buf_iodone_t verify) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - return xfs_buf_readahead_map(target, &map, 1); + return xfs_buf_readahead_map(target, &map, 1, verify); } struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks); @@ -239,7 +242,8 @@ int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length); struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags); struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target, - xfs_daddr_t daddr, size_t numblks, int flags); + xfs_daddr_t daddr, size_t numblks, int flags, + xfs_buf_iodone_t verify); void xfs_buf_hold(struct xfs_buf *bp); /* Releasing Buffers */ diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index c62e7e6ff50e..4af8bad7068c 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -2161,7 +2161,7 @@ xfs_da_read_buf( error = xfs_trans_read_buf_map(dp->i_mount, trans, dp->i_mount->m_ddev_targp, - mapp, nmap, 0, &bp); + mapp, nmap, 0, &bp, NULL); if (error) goto out_free; @@ -2237,7 +2237,7 @@ xfs_da_reada_buf( } mappedbno = mapp[0].bm_bn; - xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap); + xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, NULL); out_free: if (mapp != &map) diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 0b296253bd01..bac86984e403 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -926,7 +926,7 @@ xfs_dir2_leaf_readbuf( XFS_FSB_TO_DADDR(mp, map[mip->ra_index].br_startblock + mip->ra_offset), - (int)BTOBB(mp->m_dirblksize)); + (int)BTOBB(mp->m_dirblksize), NULL); mip->ra_current = i; } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index bf27fcca4843..e95f800333d4 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -439,7 +439,7 @@ xfs_qm_dqtobp( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, - 0, &bp); + 0, &bp, NULL); if (error || !bp) return XFS_ERROR(error); } @@ -920,7 +920,7 @@ xfs_qm_dqflush( * Get the buffer containing the on-disk dquot */ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, 0, &bp); + mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index bd9cb7f0b073..5440768ec41c 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -168,7 +168,7 @@ xfs_growfs_data_private( dpct = pct - mp->m_sb.sb_imax_pct; bp = 
xfs_buf_read_uncached(mp->m_ddev_targp, XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), - XFS_FSS_TO_BB(mp, 1), 0); + XFS_FSS_TO_BB(mp, 1), 0, NULL); if (!bp) return EIO; xfs_buf_relse(bp); @@ -439,7 +439,7 @@ xfs_growfs_data_private( if (agno < oagcount) { error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp); + XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); } else { bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 37753e1c8537..12e3dead439d 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -1490,7 +1490,7 @@ xfs_read_agi( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, bpp); + XFS_FSS_TO_BB(mp, 1), 0, bpp, NULL); if (error) return error; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 7449cb943efd..8d6963010489 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -408,7 +408,7 @@ xfs_imap_to_bp( buf_flags |= XBF_UNMAPPED; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, - (int)imap->im_len, buf_flags, &bp); + (int)imap->im_len, buf_flags, &bp, NULL); if (error) { if (error != EAGAIN) { xfs_warn(mp, diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 46b6986e39b0..1d6d2ee08495 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1129,8 +1129,7 @@ xlog_iodone(xfs_buf_t *bp) * with it being freed after writing the unmount record to the * log. */ - -} /* xlog_iodone */ +} /* * Return size of each in-core log record buffer. diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 3e06333d4bd1..eb1e29ff0c7c 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2144,7 +2144,7 @@ xlog_recover_buffer_pass2( buf_flags |= XBF_UNMAPPED; bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, - buf_flags); + buf_flags, NULL); if (!bp) return XFS_ERROR(ENOMEM); error = bp->b_error; @@ -2237,7 +2237,8 @@ xlog_recover_inode_pass2( } trace_xfs_log_recover_inode_recover(log, in_f); - bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0); + bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0, + NULL); if (!bp) { error = ENOMEM; goto error; @@ -2548,7 +2549,8 @@ xlog_recover_dquot_pass2( ASSERT(dq_f->qlf_len == 1); error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, - XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp); + XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp, + NULL); if (error) return error; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 41ae7e1590f5..d5402b0eb6a3 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -652,7 +652,7 @@ xfs_readsb(xfs_mount_t *mp, int flags) reread: bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), 0); + BTOBB(sector_size), 0, NULL); if (!bp) { if (loud) xfs_warn(mp, "SB buffer read failed"); @@ -1002,7 +1002,7 @@ xfs_check_sizes(xfs_mount_t *mp) } bp = xfs_buf_read_uncached(mp->m_ddev_targp, d - XFS_FSS_TO_BB(mp, 1), - XFS_FSS_TO_BB(mp, 1), 0); + XFS_FSS_TO_BB(mp, 1), 0, NULL); if (!bp) { xfs_warn(mp, "last sector read failed"); return EIO; @@ -1017,7 +1017,7 @@ xfs_check_sizes(xfs_mount_t *mp) } bp = xfs_buf_read_uncached(mp->m_logdev_targp, d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0); + XFS_FSB_TO_BB(mp, 1), 0, NULL); if (!bp) { xfs_warn(mp, "log device read failed"); return EIO; diff --git a/fs/xfs/xfs_qm.c 
b/fs/xfs/xfs_qm.c index 48c750b0e830..688f608b3668 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -892,7 +892,7 @@ xfs_qm_dqiter_bufs( while (blkcnt--) { error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, bno), - mp->m_quotainfo->qi_dqchunklen, 0, &bp); + mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL); if (error) break; @@ -979,7 +979,8 @@ xfs_qm_dqiterate( while (rablkcnt--) { xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, rablkno), - mp->m_quotainfo->qi_dqchunklen); + mp->m_quotainfo->qi_dqchunklen, + NULL); rablkno++; } } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index a69e0b4750a9..b271ed939d7b 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -870,7 +870,7 @@ xfs_rtbuf_get( ASSERT(map.br_startblock != NULLFSBLOCK); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, map.br_startblock), - mp->m_bsize, 0, &bp); + mp->m_bsize, 0, &bp, NULL); if (error) return error; ASSERT(!xfs_buf_geterror(bp)); @@ -1873,7 +1873,7 @@ xfs_growfs_rt( */ bp = xfs_buf_read_uncached(mp->m_rtdev_targp, XFS_FSB_TO_BB(mp, nrblocks - 1), - XFS_FSB_TO_BB(mp, 1), 0); + XFS_FSB_TO_BB(mp, 1), 0, NULL); if (!bp) return EIO; xfs_buf_relse(bp); @@ -2220,7 +2220,7 @@ xfs_rtmount_init( } bp = xfs_buf_read_uncached(mp->m_rtdev_targp, d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0); + XFS_FSB_TO_BB(mp, 1), 0, NULL); if (!bp) { xfs_warn(mp, "realtime device size check failed"); return EIO; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index db056544cbb5..f02d40296506 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -464,10 +464,7 @@ xfs_trans_get_buf( int numblks, uint flags) { - struct xfs_buf_map map = { - .bm_bn = blkno, - .bm_len = numblks, - }; + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); return xfs_trans_get_buf_map(tp, target, &map, 1, flags); } @@ -476,7 +473,8 @@ int xfs_trans_read_buf_map(struct xfs_mount *mp, struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, - struct xfs_buf **bpp); + struct xfs_buf **bpp, + xfs_buf_iodone_t verify); static inline int xfs_trans_read_buf( @@ -486,13 +484,12 @@ xfs_trans_read_buf( xfs_daddr_t blkno, int numblks, xfs_buf_flags_t flags, - struct xfs_buf **bpp) + struct xfs_buf **bpp, + xfs_buf_iodone_t verify) { - struct xfs_buf_map map = { - .bm_bn = blkno, - .bm_len = numblks, - }; - return xfs_trans_read_buf_map(mp, tp, target, &map, 1, flags, bpp); + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return xfs_trans_read_buf_map(mp, tp, target, &map, 1, + flags, bpp, verify); } struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 6311b99c267f..977628207b45 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -257,7 +257,8 @@ xfs_trans_read_buf_map( struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, - struct xfs_buf **bpp) + struct xfs_buf **bpp, + xfs_buf_iodone_t verify) { xfs_buf_t *bp; xfs_buf_log_item_t *bip; @@ -265,7 +266,7 @@ xfs_trans_read_buf_map( *bpp = NULL; if (!tp) { - bp = xfs_buf_read_map(target, map, nmaps, flags); + bp = xfs_buf_read_map(target, map, nmaps, flags, verify); if (!bp) return (flags & XBF_TRYLOCK) ? 
EAGAIN : XFS_ERROR(ENOMEM); @@ -312,7 +313,9 @@ xfs_trans_read_buf_map( if (!(XFS_BUF_ISDONE(bp))) { trace_xfs_trans_read_buf_io(bp, _RET_IP_); ASSERT(!XFS_BUF_ISASYNC(bp)); + ASSERT(bp->b_iodone == NULL); XFS_BUF_READ(bp); + bp->b_iodone = verify; xfsbdstrat(tp->t_mountp, bp); error = xfs_buf_iowait(bp); if (error) { @@ -349,7 +352,7 @@ xfs_trans_read_buf_map( return 0; } - bp = xfs_buf_read_map(target, map, nmaps, flags); + bp = xfs_buf_read_map(target, map, nmaps, flags, verify); if (bp == NULL) { *bpp = NULL; return (flags & XBF_TRYLOCK) ? diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 81c61fd17890..26880793feca 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -80,7 +80,7 @@ xfs_readlink_bmap( d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0); + bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, NULL); if (!bp) return XFS_ERROR(ENOMEM); error = bp->b_error; -- cgit v1.2.1 From eab4e63368b4cfa597dbdac66d1a7a836a693b7d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:02 +1100 Subject: xfs: uncached buffer reads need to return an error With verification being done as an IO completion callback, different errors can be returned from a read. Uncached reads only return a buffer or NULL on failure, which means the verification error cannot be returned to the caller. Split the error handling for these reads into two - a failure to get a buffer will still return NULL, but a read error will return a referenced buffer with b_error set rather than NULL. The caller is responsible for checking the error state of the buffer returned. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 9 ++------- fs/xfs/xfs_fsops.c | 5 +++++ fs/xfs/xfs_mount.c | 6 ++++++ fs/xfs/xfs_rtalloc.c | 9 ++++++++- 4 files changed, 21 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 0298dd684798..fbc965fc075a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -715,8 +715,7 @@ xfs_buf_read_uncached( int flags, xfs_buf_iodone_t verify) { - xfs_buf_t *bp; - int error; + struct xfs_buf *bp; bp = xfs_buf_get_uncached(target, numblks, flags); if (!bp) @@ -730,11 +729,7 @@ xfs_buf_read_uncached( bp->b_iodone = verify; xfsbdstrat(target->bt_mount, bp); - error = xfs_buf_iowait(bp); - if (error) { - xfs_buf_relse(bp); - return NULL; - } + xfs_buf_iowait(bp); return bp; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 5440768ec41c..f35f8d7731f0 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -171,6 +171,11 @@ xfs_growfs_data_private( XFS_FSS_TO_BB(mp, 1), 0, NULL); if (!bp) return EIO; + if (bp->b_error) { + int error = bp->b_error; + xfs_buf_relse(bp); + return error; + } xfs_buf_relse(bp); new = nb; /* use new as a temporary here */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index d5402b0eb6a3..df6d0b2aade1 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -658,6 +658,12 @@ reread: xfs_warn(mp, "SB buffer read failed"); return EIO; } + if (bp->b_error) { + error = bp->b_error; + if (loud) + xfs_warn(mp, "SB validate failed"); + goto release_buf; + } /* * Initialize the mount structure from the superblock. 
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index b271ed939d7b..98dc670d3ee0 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1876,6 +1876,11 @@ xfs_growfs_rt( XFS_FSB_TO_BB(mp, 1), 0, NULL); if (!bp) return EIO; + if (bp->b_error) { + error = bp->b_error; + xfs_buf_relse(bp); + return error; + } xfs_buf_relse(bp); /* @@ -2221,8 +2226,10 @@ xfs_rtmount_init( bp = xfs_buf_read_uncached(mp->m_rtdev_targp, d - XFS_FSB_TO_BB(mp, 1), XFS_FSB_TO_BB(mp, 1), 0, NULL); - if (!bp) { + if (!bp || bp->b_error) { xfs_warn(mp, "realtime device size check failed"); + if (bp) + xfs_buf_relse(bp); return EIO; } xfs_buf_relse(bp); -- cgit v1.2.1 From 98021821a502db347bd9c7671beeee6e8ce07ea6 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:03 +1100 Subject: xfs: verify superblocks as they are read from disk Add a superblock verify callback function and pass it into the buffer read functions. Remove the now redundant verification code that is currently in use. Adding verification shows that secondary superblocks never have their "sb_inprogress" flag cleared by mkfs.xfs, so when validating the secondary superblocks during a grow operation we have to avoid checking this field. Even if we fix mkfs, we will still have to ignore this field for verification purposes unless a version of mkfs that does not have this bug was used. Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_fsops.c | 4 +- fs/xfs/xfs_log_recover.c | 5 ++- fs/xfs/xfs_mount.c | 98 ++++++++++++++++++++++++++++++------------------ fs/xfs/xfs_mount.h | 3 +- 4 files changed, 69 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index f35f8d7731f0..cb65b067ed31 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -444,7 +444,8 @@ xfs_growfs_data_private( if (agno < oagcount) { error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); + XFS_FSS_TO_BB(mp, 1), 0, &bp, + xfs_sb_read_verify); } else { bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), @@ -462,6 +463,7 @@ xfs_growfs_data_private( break; } xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS); + /* * If we get an error writing out the alternate superblocks, * just issue a warning and continue. The real work is diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index eb1e29ff0c7c..924a4bc3d49a 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3692,13 +3692,14 @@ xlog_do_recover( /* * Now that we've finished replaying all buffer and inode - * updates, re-read in the superblock. + * updates, re-read in the superblock and reverify it. 
*/ bp = xfs_getsb(log->l_mp, 0); XFS_BUF_UNDONE(bp); ASSERT(!(XFS_BUF_ISWRITE(bp))); XFS_BUF_READ(bp); XFS_BUF_UNASYNC(bp); + bp->b_iodone = xfs_sb_read_verify; xfsbdstrat(log->l_mp, bp); error = xfs_buf_iowait(bp); if (error) { @@ -3710,7 +3711,7 @@ xlog_do_recover( /* Convert superblock from on-disk format */ sbp = &log->l_mp->m_sb; - xfs_sb_from_disk(log->l_mp, XFS_BUF_TO_SBP(bp)); + xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); ASSERT(xfs_sb_good_version(sbp)); xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index df6d0b2aade1..bff18d73c610 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -304,9 +304,8 @@ STATIC int xfs_mount_validate_sb( xfs_mount_t *mp, xfs_sb_t *sbp, - int flags) + bool check_inprogress) { - int loud = !(flags & XFS_MFSI_QUIET); /* * If the log device and data device have the @@ -316,21 +315,18 @@ xfs_mount_validate_sb( * a volume filesystem in a non-volume manner. */ if (sbp->sb_magicnum != XFS_SB_MAGIC) { - if (loud) - xfs_warn(mp, "bad magic number"); + xfs_warn(mp, "bad magic number"); return XFS_ERROR(EWRONGFS); } if (!xfs_sb_good_version(sbp)) { - if (loud) - xfs_warn(mp, "bad version"); + xfs_warn(mp, "bad version"); return XFS_ERROR(EWRONGFS); } if (unlikely( sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { - if (loud) - xfs_warn(mp, + xfs_warn(mp, "filesystem is marked as having an external log; " "specify logdev on the mount command line."); return XFS_ERROR(EINVAL); @@ -338,8 +334,7 @@ xfs_mount_validate_sb( if (unlikely( sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { - if (loud) - xfs_warn(mp, + xfs_warn(mp, "filesystem is marked as having an internal log; " "do not specify logdev on the mount command line."); return XFS_ERROR(EINVAL); @@ -373,8 +368,7 @@ xfs_mount_validate_sb( sbp->sb_dblocks == 0 || sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) || sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) { - if (loud) - XFS_CORRUPTION_ERROR("SB sanity check failed", + XFS_CORRUPTION_ERROR("SB sanity check failed", XFS_ERRLEVEL_LOW, mp, sbp); return XFS_ERROR(EFSCORRUPTED); } @@ -383,12 +377,10 @@ xfs_mount_validate_sb( * Until this is fixed only page-sized or smaller data blocks work. */ if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { - if (loud) { - xfs_warn(mp, + xfs_warn(mp, "File system with blocksize %d bytes. " "Only pagesize (%ld) or less will currently work.", sbp->sb_blocksize, PAGE_SIZE); - } return XFS_ERROR(ENOSYS); } @@ -402,23 +394,20 @@ xfs_mount_validate_sb( case 2048: break; default: - if (loud) - xfs_warn(mp, "inode size of %d bytes not supported", + xfs_warn(mp, "inode size of %d bytes not supported", sbp->sb_inodesize); return XFS_ERROR(ENOSYS); } if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { - if (loud) - xfs_warn(mp, + xfs_warn(mp, "file system too large to be mounted on this system."); return XFS_ERROR(EFBIG); } - if (unlikely(sbp->sb_inprogress)) { - if (loud) - xfs_warn(mp, "file system busy"); + if (check_inprogress && sbp->sb_inprogress) { + xfs_warn(mp, "Offline file system operation in progress!"); return XFS_ERROR(EFSCORRUPTED); } @@ -426,9 +415,7 @@ xfs_mount_validate_sb( * Version 1 directory format has never worked on Linux. 
*/ if (unlikely(!xfs_sb_version_hasdirv2(sbp))) { - if (loud) - xfs_warn(mp, - "file system using version 1 directory format"); + xfs_warn(mp, "file system using version 1 directory format"); return XFS_ERROR(ENOSYS); } @@ -521,11 +508,9 @@ out_unwind: void xfs_sb_from_disk( - struct xfs_mount *mp, + struct xfs_sb *to, xfs_dsb_t *from) { - struct xfs_sb *to = &mp->m_sb; - to->sb_magicnum = be32_to_cpu(from->sb_magicnum); to->sb_blocksize = be32_to_cpu(from->sb_blocksize); to->sb_dblocks = be64_to_cpu(from->sb_dblocks); @@ -627,6 +612,50 @@ xfs_sb_to_disk( } } +void +xfs_sb_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_sb sb; + int error; + + xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp)); + + /* + * Only check the in progress field for the primary superblock as + * mkfs.xfs doesn't clear it from secondary superblocks. + */ + error = xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR); + if (error) + xfs_buf_ioerror(bp, error); + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + +/* + * We may be probed for a filesystem match, so we may not want to emit + * messages when the superblock buffer is not actually an XFS superblock. + * If we find an XFS superblock, then run a normal, noisy mount because we are + * really going to mount it and want to know about errors. + */ +void +xfs_sb_quiet_read_verify( + struct xfs_buf *bp) +{ + struct xfs_sb sb; + + xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp)); + + if (sb.sb_magicnum == XFS_SB_MAGIC) { + /* XFS filesystem, verify noisily! */ + xfs_sb_read_verify(bp); + return; + } + /* quietly fail */ + xfs_buf_ioerror(bp, EFSCORRUPTED); +} + /* * xfs_readsb * @@ -652,7 +681,9 @@ xfs_readsb(xfs_mount_t *mp, int flags) reread: bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), 0, NULL); + BTOBB(sector_size), 0, + loud ? xfs_sb_read_verify + : xfs_sb_quiet_read_verify); if (!bp) { if (loud) xfs_warn(mp, "SB buffer read failed"); @@ -667,15 +698,8 @@ reread: /* * Initialize the mount structure from the superblock. - * But first do some basic consistency checking. */ - xfs_sb_from_disk(mp, XFS_BUF_TO_SBP(bp)); - error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); - if (error) { - if (loud) - xfs_warn(mp, "SB validate failed"); - goto release_buf; - } + xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); /* * We must be able to do sector-sized and sector-aligned IO. diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index dc306a09f56f..de9089acc610 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -385,10 +385,11 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *); #endif /* __KERNEL__ */ +extern void xfs_sb_read_verify(struct xfs_buf *); extern void xfs_mod_sb(struct xfs_trans *, __int64_t); extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, xfs_agnumber_t *); -extern void xfs_sb_from_disk(struct xfs_mount *, struct xfs_dsb *); +extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); #endif /* __XFS_MOUNT_H__ */ -- cgit v1.2.1 From 5d5f527d13369d0047d52b7ac4ddee4f8c0ad173 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 14 Nov 2012 17:44:56 +1100 Subject: xfs: verify AGF blocks as they are read from disk Add an AGF block verify callback function and pass it into the buffer read functions. This replaces the existing verification that is done after the read completes.
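Like the superblock verifiers above, the AGF callback (and every other verifier in this series) follows the same completion contract: run the structure-specific checks, mark the buffer EFSCORRUPTED on failure, then clear the callback and finish the I/O. A minimal sketch of that contract follows; xfs_foo_read_verify and foo_block_ok are placeholder names for illustration, not real XFS symbols:

static void
xfs_foo_read_verify(
	struct xfs_buf	*bp)
{
	/* structure-specific sanity checks on the freshly read buffer */
	if (!foo_block_ok(bp))
		xfs_buf_ioerror(bp, EFSCORRUPTED);

	/*
	 * The verifier runs as the buffer's b_iodone completion, so it
	 * must clear itself and complete the I/O; otherwise the buffer
	 * is never marked done and xfs_buf_iowait() callers would hang.
	 */
	bp->b_iodone = NULL;
	xfs_buf_ioend(bp, 0);
}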
Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 68 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 21c3db08fd01..c9eb955a49c7 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2091,6 +2091,47 @@ xfs_alloc_put_freelist( return 0; } +static void +xfs_agf_read_verify( + struct xfs_buf *bp) + { + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_agf *agf; + int agf_ok; + + agf = XFS_BUF_TO_AGF(bp); + + agf_ok = agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && + XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && + be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && + be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp); + + /* + * during growfs operations, the perag is not fully initialised, + * so we can't use it for any useful checking. growfs ensures we can't + * use it by using uncached buffers that don't have the perag attached + * so we can detect and avoid this problem. + */ + if (bp->b_pag) + agf_ok = agf_ok && be32_to_cpu(agf->agf_seqno) == + bp->b_pag->pag_agno; + + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <= + be32_to_cpu(agf->agf_length); + + if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, + XFS_RANDOM_ALLOC_READ_AGF))) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agf); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + /* * Read in the allocation group header (free/alloc section). */ @@ -2102,44 +2143,19 @@ xfs_read_agf( int flags, /* XFS_BUF_ */ struct xfs_buf **bpp) /* buffer for the ag freelist header */ { - struct xfs_agf *agf; /* ag freelist header */ - int agf_ok; /* set if agf is consistent */ int error; ASSERT(agno != NULLAGNUMBER); error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), flags, bpp, NULL); + XFS_FSS_TO_BB(mp, 1), flags, bpp, xfs_agf_read_verify); if (error) return error; if (!*bpp) return 0; ASSERT(!(*bpp)->b_error); - agf = XFS_BUF_TO_AGF(*bpp); - - /* - * Validate the magic number of the agf block. 
- */ - agf_ok = - agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && - XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && - be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && - be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_seqno) == agno; - if (xfs_sb_version_haslazysbcount(&mp->m_sb)) - agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <= - be32_to_cpu(agf->agf_length); - if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, - XFS_RANDOM_ALLOC_READ_AGF))) { - XFS_CORRUPTION_ERROR("xfs_alloc_read_agf", - XFS_ERRLEVEL_LOW, mp, agf); - xfs_trans_brelse(tp, *bpp); - return XFS_ERROR(EFSCORRUPTED); - } xfs_buf_set_ref(*bpp, XFS_AGF_REF); return 0; } -- cgit v1.2.1 From 3702ce6ed71cd60451ab278088863456dcb0dd99 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:05 +1100 Subject: xfs: verify AGI blocks as they are read from disk Add an AGI block verify callback function and pass it into the buffer read functions. Remove the now redundant verification code that is currently in use. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_ialloc.c | 56 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 12e3dead439d..5bd255e5f7b8 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -1472,6 +1472,40 @@ xfs_check_agi_unlinked( #define xfs_check_agi_unlinked(agi) #endif +static void +xfs_agi_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); + int agi_ok; + + /* + * Validate the magic number of the agi block. + */ + agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) && + XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)); + + /* + * during growfs operations, the perag is not fully initialised, + * so we can't use it for any useful checking. growfs ensures we can't + * use it by using uncached buffers that don't have the perag attached + * so we can detect and avoid this problem. + */ + if (bp->b_pag) + agi_ok = agi_ok && be32_to_cpu(agi->agi_seqno) == + bp->b_pag->pag_agno; + + if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, + XFS_RANDOM_IALLOC_READ_AGI))) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + xfs_check_agi_unlinked(agi); + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + /* * Read in the allocation group header (inode allocation section) */ @@ -1482,38 +1516,18 @@ xfs_read_agi( xfs_agnumber_t agno, /* allocation group number */ struct xfs_buf **bpp) /* allocation group hdr buf */ { - struct xfs_agi *agi; /* allocation group header */ - int agi_ok; /* agi is consistent */ int error; ASSERT(agno != NULLAGNUMBER); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, bpp, NULL); + XFS_FSS_TO_BB(mp, 1), 0, bpp, xfs_agi_read_verify); if (error) return error; ASSERT(!xfs_buf_geterror(*bpp)); - agi = XFS_BUF_TO_AGI(*bpp); - - /* - * Validate the magic number of the agi block. 
- */ - agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) && - XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)) && - be32_to_cpu(agi->agi_seqno) == agno; - if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, - XFS_RANDOM_IALLOC_READ_AGI))) { - XFS_CORRUPTION_ERROR("xfs_read_agi", XFS_ERRLEVEL_LOW, - mp, agi); - xfs_trans_brelse(tp, *bpp); - return XFS_ERROR(EFSCORRUPTED); - } - xfs_buf_set_ref(*bpp, XFS_AGI_REF); - - xfs_check_agi_unlinked(agi); return 0; } -- cgit v1.2.1 From bb80c6d79a3b0f9b6c3236a4bec021c72615bfd1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:06 +1100 Subject: xfs: verify AGFL blocks as they are read from disk Add an AGFL block verify callback function and pass it into the buffer read functions. While this commit adds verification code to the AGFL, it cannot be used reliably until the CRC format change comes along as mkfs does not initialise the full AGFL. Hence it can be full of garbage at the first mount and will fail verification right now. CRC enabled filesystems won't have this problem, so leave the code that has already been written ifdef'd out until the proper time. Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index c9eb955a49c7..38b4ab8957ff 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -430,6 +430,43 @@ xfs_alloc_fixup_trees( return 0; } +void +xfs_agfl_read_verify( + struct xfs_buf *bp) +{ +#ifdef WHEN_CRCS_COME_ALONG + /* + * we cannot actually do any verification of the AGFL because mkfs does + * not initialise the AGFL to zero or NULL. Hence the only valid part of + * the AGFL is what the AGF says is active. We can't get to the AGF, so + * we can't verify just those entries are valid. + * + * This problem goes away when the CRC format change comes along as that + * requires the AGFL to be initialised by mkfs. At that point, we can + * verify the blocks in the agfl -active or not- lie within the bounds + * of the AG. Until then, just leave this check ifdef'd out. + */ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); + int agfl_ok = 1; + + int i; + + for (i = 0; i < XFS_AGFL_SIZE(mp); i++) { + if (be32_to_cpu(agfl->agfl_bno[i]) == NULLAGBLOCK || + be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks) + agfl_ok = 0; + } + + if (!agfl_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agfl); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +#endif + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + /* * Read in the allocation group free block array. */ @@ -447,7 +484,7 @@ xfs_alloc_read_agfl( error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); + XFS_FSS_TO_BB(mp, 1), 0, &bp, xfs_agfl_read_verify); if (error) return error; ASSERT(!xfs_buf_geterror(bp)); -- cgit v1.2.1 From af133e8606d32c2aed43870491ebbdc56feec8a8 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:07 +1100 Subject: xfs: verify inode buffers as they are read from disk Add an inode buffer verify callback function and pass it into the buffer read functions. Inodes are special in that the verbose checks will be done when reading the inode, but we still need to sanity check the buffer when that is first read. 
Always verify the magic numbers in all inodes in the buffer, rather than just on debug kernels. Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_inode.c | 100 +++++++++++++++++++++++++++-------------------------- 1 file changed, 51 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 8d6963010489..514eac913f1c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -382,6 +382,46 @@ xfs_inobp_check( } #endif +static void +xfs_inode_buf_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + int i; + int ni; + + /* + * Validate the magic number and version of every inode in the buffer + */ + ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; + for (i = 0; i < ni; i++) { + int di_ok; + xfs_dinode_t *dip; + + dip = (struct xfs_dinode *)xfs_buf_offset(bp, + (i << mp->m_sb.sb_inodelog)); + di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && + XFS_DINODE_GOOD_VERSION(dip->di_version); + if (unlikely(XFS_TEST_ERROR(!di_ok, mp, + XFS_ERRTAG_ITOBP_INOTOBP, + XFS_RANDOM_ITOBP_INOTOBP))) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH, + mp, dip); +#ifdef DEBUG + xfs_emerg(mp, + "bad inode magic/vsn daddr %lld #%d (magic=%x)", + (unsigned long long)bp->b_bn, i, + be16_to_cpu(dip->di_magic)); + ASSERT(0); +#endif + } + } + xfs_inobp_check(mp, bp); + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + /* * This routine is called to map an inode to the buffer containing the on-disk * version of the inode. It returns a pointer to the buffer containing the @@ -396,71 +436,33 @@ xfs_imap_to_bp( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_imap *imap, - struct xfs_dinode **dipp, + struct xfs_dinode **dipp, struct xfs_buf **bpp, uint buf_flags, uint iget_flags) { struct xfs_buf *bp; int error; - int i; - int ni; buf_flags |= XBF_UNMAPPED; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, - (int)imap->im_len, buf_flags, &bp, NULL); + (int)imap->im_len, buf_flags, &bp, + xfs_inode_buf_verify); if (error) { - if (error != EAGAIN) { - xfs_warn(mp, - "%s: xfs_trans_read_buf() returned error %d.", - __func__, error); - } else { + if (error == EAGAIN) { ASSERT(buf_flags & XBF_TRYLOCK); + return error; } - return error; - } - /* - * Validate the magic number and version of every inode in the buffer - * (if DEBUG kernel) or the first inode in the buffer, otherwise.
- */ -#ifdef DEBUG - ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog; -#else /* usual case */ - ni = 1; -#endif + if (error == EFSCORRUPTED && + (iget_flags & XFS_IGET_UNTRUSTED)) + return XFS_ERROR(EINVAL); - for (i = 0; i < ni; i++) { - int di_ok; - xfs_dinode_t *dip; - - dip = (xfs_dinode_t *)xfs_buf_offset(bp, - (i << mp->m_sb.sb_inodelog)); - di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && - XFS_DINODE_GOOD_VERSION(dip->di_version); - if (unlikely(XFS_TEST_ERROR(!di_ok, mp, - XFS_ERRTAG_ITOBP_INOTOBP, - XFS_RANDOM_ITOBP_INOTOBP))) { - if (iget_flags & XFS_IGET_UNTRUSTED) { - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EINVAL); - } - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH, - mp, dip); -#ifdef DEBUG - xfs_emerg(mp, - "bad inode magic/vsn daddr %lld #%d (magic=%x)", - (unsigned long long)imap->im_blkno, i, - be16_to_cpu(dip->di_magic)); - ASSERT(0); -#endif - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EFSCORRUPTED); - } + xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.", + __func__, error); + return error; } - xfs_inobp_check(mp, bp); - *bpp = bp; *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset); return 0; -- cgit v1.2.1 From 3d3e6f64e22c94115d47de670611bcd3ecda3796 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:08 +1100 Subject: xfs: verify btree blocks as they are read from disk Add a btree block verify callback function and pass it into the buffer read functions. Because each different btree block type requires different verification, add a function to the ops structure that is called from the generic code. Also, propagate the verification callback functions through the readahead functions, and into the external bmap and bulkstat inode readahead code that uses the generic btree buffer read functions. Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc_btree.c | 61 +++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_bmap.c | 60 +++++++++++++++++++++++++----------------- fs/xfs/xfs_bmap_btree.c | 47 +++++++++++++++++++++++++++++++++ fs/xfs/xfs_bmap_btree.h | 1 + fs/xfs/xfs_btree.c | 66 ++++++++++++++++++++++++----------------------- fs/xfs/xfs_btree.h | 10 ++++--- fs/xfs/xfs_ialloc_btree.c | 40 ++++++++++++++++++++++++++++ fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_inode.h | 1 + fs/xfs/xfs_itable.c | 3 ++- 10 files changed, 230 insertions(+), 61 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index f7876c6d6165..46961e52e9b8 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -272,6 +272,66 @@ xfs_allocbt_key_diff( return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; } +void +xfs_allocbt_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_perag *pag = bp->b_pag; + unsigned int level; + int sblock_ok; /* block passes checks */ + + /* + * magic number and level verification + * + * During growfs operations, we can't verify the exact level as the + * perag is not fully initialised and hence not attached to the buffer. + * In this case, check against the maximum tree depth.
+ */ + level = be16_to_cpu(block->bb_level); + switch (block->bb_magic) { + case cpu_to_be32(XFS_ABTB_MAGIC): + if (pag) + sblock_ok = level < pag->pagf_levels[XFS_BTNUM_BNOi]; + else + sblock_ok = level < mp->m_ag_maxlevels; + break; + case cpu_to_be32(XFS_ABTC_MAGIC): + if (pag) + sblock_ok = level < pag->pagf_levels[XFS_BTNUM_CNTi]; + else + sblock_ok = level < mp->m_ag_maxlevels; + break; + default: + sblock_ok = 0; + break; + } + + /* numrecs verification */ + sblock_ok = sblock_ok && + be16_to_cpu(block->bb_numrecs) <= mp->m_alloc_mxr[level != 0]; + + /* sibling pointer verification */ + sblock_ok = sblock_ok && + (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) || + be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) && + block->bb_u.s.bb_leftsib && + (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || + be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) && + block->bb_u.s.bb_rightsib; + + if (!sblock_ok) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR("xfs_allocbt_read_verify", + XFS_ERRLEVEL_LOW, mp, block); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + #ifdef DEBUG STATIC int xfs_allocbt_keys_inorder( @@ -327,6 +387,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = { .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_allocbt_key_diff, + .read_verify = xfs_allocbt_read_verify, #ifdef DEBUG .keys_inorder = xfs_allocbt_keys_inorder, .recs_inorder = xfs_allocbt_recs_inorder, diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index a60f3d1f151c..9ae7aba52e0f 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2662,8 +2662,9 @@ xfs_bmap_btree_to_extents( if ((error = xfs_btree_check_lptr(cur, cbno, 1))) return error; #endif - if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, - XFS_BMAP_BTREE_REF))) + error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF, + xfs_bmbt_read_verify); + if (error) return error; cblock = XFS_BUF_TO_BLOCK(cbp); if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) @@ -4078,8 +4079,9 @@ xfs_bmap_read_extents( * pointer (leftmost) at each level. */ while (level-- > 0) { - if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF))) + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, xfs_bmbt_read_verify); + if (error) return error; block = XFS_BUF_TO_BLOCK(bp); XFS_WANT_CORRUPTED_GOTO( @@ -4124,7 +4126,8 @@ xfs_bmap_read_extents( */ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); if (nextbno != NULLFSBLOCK) - xfs_btree_reada_bufl(mp, nextbno, 1); + xfs_btree_reada_bufl(mp, nextbno, 1, + xfs_bmbt_read_verify); /* * Copy records into the extent records. 
*/ @@ -4156,8 +4159,9 @@ */ if (bno == NULLFSBLOCK) break; - if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF))) + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, xfs_bmbt_read_verify); + if (error) return error; block = XFS_BUF_TO_BLOCK(bp); } @@ -5868,15 +5872,16 @@ xfs_bmap_check_leaf_extents( */ while (level-- > 0) { /* See if buf is in cur first */ + bp_release = 0; bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); - if (bp) { - bp_release = 0; - } else { + if (!bp) { bp_release = 1; + error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + xfs_bmbt_read_verify); + if (error) + goto error_norelse; } - if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, - XFS_BMAP_BTREE_REF))) - goto error_norelse; block = XFS_BUF_TO_BLOCK(bp); XFS_WANT_CORRUPTED_GOTO( xfs_bmap_sanity_check(mp, bp, level), @@ -5953,15 +5958,16 @@ xfs_bmap_check_leaf_extents( if (bno == NULLFSBLOCK) break; + bp_release = 0; bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); - if (bp) { - bp_release = 0; - } else { + if (!bp) { bp_release = 1; + error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + xfs_bmbt_read_verify); + if (error) + goto error_norelse; } - if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, - XFS_BMAP_BTREE_REF))) - goto error_norelse; block = XFS_BUF_TO_BLOCK(bp); } if (bp_release) { @@ -6052,7 +6058,9 @@ xfs_bmap_count_tree( struct xfs_btree_block *block, *nextblock; int numrecs; - if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF))) + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF, + xfs_bmbt_read_verify); + if (error) return error; *count += 1; block = XFS_BUF_TO_BLOCK(bp); @@ -6061,8 +6069,10 @@ xfs_bmap_count_tree( /* Not at node above leaves, count this level of nodes */ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); while (nextbno != NULLFSBLOCK) { - if ((error = xfs_btree_read_bufl(mp, tp, nextbno, - 0, &nbp, XFS_BMAP_BTREE_REF))) + error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp, + XFS_BMAP_BTREE_REF, + xfs_bmbt_read_verify); + if (error) return error; *count += 1; nextblock = XFS_BUF_TO_BLOCK(nbp); @@ -6091,8 +6101,10 @@ xfs_bmap_count_tree( if (nextbno == NULLFSBLOCK) break; bno = nextbno; - if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF))) + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + xfs_bmbt_read_verify); + if (error) return error; *count += 1; block = XFS_BUF_TO_BLOCK(bp); diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 862084a47a7e..bddca9b92869 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -36,6 +36,7 @@ #include "xfs_bmap.h" #include "xfs_error.h" #include "xfs_quota.h" +#include "xfs_trace.h" /* * Determine the extent state. @@ -707,6 +708,51 @@ xfs_bmbt_key_diff( cur->bc_rec.b.br_startoff; } +void +xfs_bmbt_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + unsigned int level; + int lblock_ok; /* block passes checks */ + + /* magic number and level verification. + * + * We don't know what fork we belong to, so just verify that the level + * is less than the maximum of the two. Later checks will be more + * precise.
+ */ + level = be16_to_cpu(block->bb_level); + lblock_ok = block->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC) && + level < max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]); + + /* numrecs verification */ + lblock_ok = lblock_ok && + be16_to_cpu(block->bb_numrecs) <= mp->m_bmap_dmxr[level != 0]; + + /* sibling pointer verification */ + lblock_ok = lblock_ok && + block->bb_u.l.bb_leftsib && + (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) || + XFS_FSB_SANITY_CHECK(mp, + be64_to_cpu(block->bb_u.l.bb_leftsib))) && + block->bb_u.l.bb_rightsib && + (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) || + XFS_FSB_SANITY_CHECK(mp, + be64_to_cpu(block->bb_u.l.bb_rightsib))); + + if (!lblock_ok) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR("xfs_bmbt_read_verify", + XFS_ERRLEVEL_LOW, mp, block); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + #ifdef DEBUG STATIC int xfs_bmbt_keys_inorder( @@ -746,6 +792,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { .init_rec_from_cur = xfs_bmbt_init_rec_from_cur, .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, .key_diff = xfs_bmbt_key_diff, + .read_verify = xfs_bmbt_read_verify, #ifdef DEBUG .keys_inorder = xfs_bmbt_keys_inorder, .recs_inorder = xfs_bmbt_recs_inorder, diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 0e66c4ea0f85..1d00fbe9dd79 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -232,6 +232,7 @@ extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int, extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level); extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf); extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); +extern void xfs_bmbt_read_verify(struct xfs_buf *bp); extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 7e791160092d..ef1066078c33 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -270,7 +270,8 @@ xfs_btree_dup_cursor( if (bp) { error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_BUF_ADDR(bp), mp->m_bsize, - 0, &bp, NULL); + 0, &bp, + cur->bc_ops->read_verify); if (error) { xfs_btree_del_cursor(new, error); *ncur = NULL; @@ -612,23 +613,24 @@ xfs_btree_offsets( * Get a buffer for the block, return it read in. * Long-form addressing. 
*/ -int /* error */ +int xfs_btree_read_bufl( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_fsblock_t fsbno, /* file system block number */ - uint lock, /* lock flags for read_buf */ - xfs_buf_t **bpp, /* buffer for fsbno */ - int refval) /* ref count value for buffer */ -{ - xfs_buf_t *bp; /* return value */ + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + uint lock, /* lock flags for read_buf */ + struct xfs_buf **bpp, /* buffer for fsbno */ + int refval, /* ref count value for buffer */ + xfs_buf_iodone_t verify) +{ + struct xfs_buf *bp; /* return value */ xfs_daddr_t d; /* real disk block address */ - int error; + int error; ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, - mp->m_bsize, lock, &bp, NULL); + mp->m_bsize, lock, &bp, verify); if (error) return error; ASSERT(!xfs_buf_geterror(bp)); @@ -645,15 +647,16 @@ xfs_btree_read_bufl( /* ARGSUSED */ void xfs_btree_reada_bufl( - xfs_mount_t *mp, /* file system mount point */ - xfs_fsblock_t fsbno, /* file system block number */ - xfs_extlen_t count) /* count of filesystem blocks */ + struct xfs_mount *mp, /* file system mount point */ + xfs_fsblock_t fsbno, /* file system block number */ + xfs_extlen_t count, /* count of filesystem blocks */ + xfs_buf_iodone_t verify) { xfs_daddr_t d; ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); - xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, NULL); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, verify); } /* @@ -663,17 +666,18 @@ xfs_btree_reada_bufl( /* ARGSUSED */ void xfs_btree_reada_bufs( - xfs_mount_t *mp, /* file system mount point */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno, /* allocation group block number */ - xfs_extlen_t count) /* count of filesystem blocks */ + struct xfs_mount *mp, /* file system mount point */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + xfs_extlen_t count, /* count of filesystem blocks */ + xfs_buf_iodone_t verify) { xfs_daddr_t d; ASSERT(agno != NULLAGNUMBER); ASSERT(agbno != NULLAGBLOCK); d = XFS_AGB_TO_DADDR(mp, agno, agbno); - xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, NULL); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, verify); } STATIC int @@ -687,12 +691,14 @@ xfs_btree_readahead_lblock( xfs_dfsbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) { - xfs_btree_reada_bufl(cur->bc_mp, left, 1); + xfs_btree_reada_bufl(cur->bc_mp, left, 1, + cur->bc_ops->read_verify); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) { - xfs_btree_reada_bufl(cur->bc_mp, right, 1); + xfs_btree_reada_bufl(cur->bc_mp, right, 1, + cur->bc_ops->read_verify); rval++; } @@ -712,13 +718,13 @@ xfs_btree_readahead_sblock( if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, - left, 1); + left, 1, cur->bc_ops->read_verify); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, - right, 1); + right, 1, cur->bc_ops->read_verify); rval++; } @@ -1016,19 +1022,15 @@ xfs_btree_read_buf_block( d = xfs_btree_ptr_to_daddr(cur, ptr); error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, - 
mp->m_bsize, flags, bpp, NULL); + mp->m_bsize, flags, bpp, + cur->bc_ops->read_verify); if (error) return error; ASSERT(!xfs_buf_geterror(*bpp)); - xfs_btree_set_refs(cur, *bpp); *block = XFS_BUF_TO_BLOCK(*bpp); - - error = xfs_btree_check_block(cur, *block, level, *bpp); - if (error) - xfs_trans_brelse(cur->bc_tp, *bpp); - return error; + return 0; } /* diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index c9cf2d00e236..3a4c314047a0 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -188,6 +188,7 @@ struct xfs_btree_ops { __int64_t (*key_diff)(struct xfs_btree_cur *cur, union xfs_btree_key *key); + void (*read_verify)(struct xfs_buf *bp); #ifdef DEBUG /* check that k1 is lower than k2 */ int (*keys_inorder)(struct xfs_btree_cur *cur, @@ -355,7 +356,8 @@ xfs_btree_read_bufl( xfs_fsblock_t fsbno, /* file system block number */ uint lock, /* lock flags for read_buf */ struct xfs_buf **bpp, /* buffer for fsbno */ - int refval);/* ref count value for buffer */ + int refval, /* ref count value for buffer */ + xfs_buf_iodone_t verify); /* * Read-ahead the block, don't wait for it, don't return a buffer. @@ -365,7 +367,8 @@ void /* error */ xfs_btree_reada_bufl( struct xfs_mount *mp, /* file system mount point */ xfs_fsblock_t fsbno, /* file system block number */ - xfs_extlen_t count); /* count of filesystem blocks */ + xfs_extlen_t count, /* count of filesystem blocks */ + xfs_buf_iodone_t verify); /* * Read-ahead the block, don't wait for it, don't return a buffer. @@ -376,7 +379,8 @@ xfs_btree_reada_bufs( struct xfs_mount *mp, /* file system mount point */ xfs_agnumber_t agno, /* allocation group number */ xfs_agblock_t agbno, /* allocation group block number */ - xfs_extlen_t count); /* count of filesystem blocks */ + xfs_extlen_t count, /* count of filesystem blocks */ + xfs_buf_iodone_t verify); /* * Initialise a new btree block header diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index 2b8b7a37aa18..11306c6d61c7 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -33,6 +33,7 @@ #include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_error.h" +#include "xfs_trace.h" STATIC int @@ -181,6 +182,44 @@ xfs_inobt_key_diff( cur->bc_rec.i.ir_startino; } +void +xfs_inobt_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + unsigned int level; + int sblock_ok; /* block passes checks */ + + /* magic number and level verification */ + level = be16_to_cpu(block->bb_level); + sblock_ok = block->bb_magic == cpu_to_be32(XFS_IBT_MAGIC) && + level < mp->m_in_maxlevels; + + /* numrecs verification */ + sblock_ok = sblock_ok && + be16_to_cpu(block->bb_numrecs) <= mp->m_inobt_mxr[level != 0]; + + /* sibling pointer verification */ + sblock_ok = sblock_ok && + (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) || + be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) && + block->bb_u.s.bb_leftsib && + (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || + be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) && + block->bb_u.s.bb_rightsib; + + if (!sblock_ok) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR("xfs_inobt_read_verify", + XFS_ERRLEVEL_LOW, mp, block); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + #ifdef DEBUG STATIC int xfs_inobt_keys_inorder( @@ -218,6 +257,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = { .init_rec_from_cur = 
xfs_inobt_init_rec_from_cur, .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, .key_diff = xfs_inobt_key_diff, + .read_verify = xfs_inobt_read_verify, #ifdef DEBUG .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 514eac913f1c..3a243d076950 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -382,7 +382,7 @@ xfs_inobp_check( } #endif -static void +void xfs_inode_buf_verify( struct xfs_buf *bp) { diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 21b4de3df716..1a892114792f 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -554,6 +554,7 @@ int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, struct xfs_buf **, uint, uint); int xfs_iread(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, uint); +void xfs_inode_buf_verify(struct xfs_buf *); void xfs_dinode_to_disk(struct xfs_dinode *, struct xfs_icdinode *); void xfs_idestroy_fork(struct xfs_inode *, int); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 3998fd2a7949..0f18d412e3e8 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -396,7 +396,8 @@ xfs_bulkstat( if (xfs_inobt_maskn(chunkidx, nicluster) & ~r.ir_free) xfs_btree_reada_bufs(mp, agno, - agbno, nbcluster); + agbno, nbcluster, + xfs_inode_buf_verify); } irbp->ir_startino = r.ir_startino; irbp->ir_freecount = r.ir_freecount; -- cgit v1.2.1 From c6319198702350a2215a8c0cacd6cc4283728a1b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 14 Nov 2012 17:50:13 +1100 Subject: xfs: verify dquot blocks as they are read from disk Add a dquot buffer verify callback function and pass it into the buffer read functions. This checks all the dquots in a buffer, but cannot completely verify the dquot ids are correct. Also, errors cannot be repaired, so an additional function is added to repair bad dquots in the buffer if such an error is detected in a context where repair is allowed. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_dquot.c | 117 +++++++++++++++++++++++++++++++++++++++++++---------- fs/xfs/xfs_dquot.h | 1 + fs/xfs/xfs_qm.c | 3 +- 3 files changed, 98 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index e95f800333d4..0ba0f0992d6e 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -360,6 +360,89 @@ xfs_qm_dqalloc( return (error); } +void +xfs_dquot_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; + struct xfs_disk_dquot *ddq; + xfs_dqid_t id = 0; + int i; + + /* + * On the first read of the buffer, verify that each dquot is valid. + * We don't know what the id of the dquot is supposed to be, just that + * they should be increasing monotonically within the buffer. If the + * first id is corrupt, then it will fail on the second dquot in the + * buffer so corruptions could point to the wrong dquot in this case. 
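+ * (Concretely: if the chunk should hold ids N..N+nr-1 and dquot 0's id
+ * is damaged, dquot 0 still passes its own check, but dquot 1 is then
+ * compared against the bogus id + 1 and trips the failure there.)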
+ */ + for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { + int error; + + ddq = &d[i].dd_diskdq; + + if (i == 0) + id = be32_to_cpu(ddq->d_id); + + error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, + "xfs_dquot_read_verify"); + if (error) { + XFS_CORRUPTION_ERROR("xfs_dquot_read_verify", + XFS_ERRLEVEL_LOW, mp, d); + xfs_buf_ioerror(bp, EFSCORRUPTED); + break; + } + } + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + +STATIC int +xfs_qm_dqrepair( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_dquot *dqp, + xfs_dqid_t firstid, + struct xfs_buf **bpp) +{ + int error; + struct xfs_disk_dquot *ddq; + struct xfs_dqblk *d; + int i; + + /* + * Read the buffer without verification so we get the corrupted + * buffer returned to us. + */ + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, + 0, bpp, NULL); + + if (error) { + ASSERT(*bpp == NULL); + return XFS_ERROR(error); + } + + ASSERT(xfs_buf_islocked(*bpp)); + d = (struct xfs_dqblk *)(*bpp)->b_addr; + + /* Do the actual repair of dquots in this buffer */ + for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { + ddq = &d[i].dd_diskdq; + error = xfs_qm_dqcheck(mp, ddq, firstid + i, + dqp->dq_flags & XFS_DQ_ALLTYPES, + XFS_QMOPT_DQREPAIR, "xfs_qm_dqrepair"); + if (error) { + /* repair failed, we're screwed */ + xfs_trans_brelse(tp, *bpp); + return XFS_ERROR(EIO); + } + } + + return 0; +} + /* * Maps a dquot to the buffer containing its on-disk version. * This returns a ptr to the buffer containing the on-disk dquot @@ -378,7 +461,6 @@ xfs_qm_dqtobp( xfs_buf_t *bp; xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp); xfs_mount_t *mp = dqp->q_mount; - xfs_disk_dquot_t *ddq; xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); xfs_trans_t *tp = (tpp ? *tpp : NULL); @@ -439,33 +521,24 @@ xfs_qm_dqtobp( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, - 0, &bp, NULL); - if (error || !bp) - return XFS_ERROR(error); - } + 0, &bp, xfs_dquot_read_verify); - ASSERT(xfs_buf_islocked(bp)); - - /* - * calculate the location of the dquot inside the buffer. - */ - ddq = bp->b_addr + dqp->q_bufoffset; + if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) { + xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff * + mp->m_quotainfo->qi_dqperchunk; + ASSERT(bp == NULL); + error = xfs_qm_dqrepair(mp, tp, dqp, firstid, &bp); + } - /* - * A simple sanity check in case we got a corrupted dquot... 
- */ - error = xfs_qm_dqcheck(mp, ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES, - flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN), - "dqtobp"); - if (error) { - if (!(flags & XFS_QMOPT_DQREPAIR)) { - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EIO); + if (error) { + ASSERT(bp == NULL); + return XFS_ERROR(error); } } + ASSERT(xfs_buf_islocked(bp)); *O_bpp = bp; - *O_ddpp = ddq; + *O_ddpp = bp->b_addr + dqp->q_bufoffset; return (0); } diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 7d20af27346d..a08ba92d7da0 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -140,6 +140,7 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type) extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, uint, struct xfs_dquot **); +extern void xfs_dquot_read_verify(struct xfs_buf *bp); extern void xfs_qm_dqdestroy(xfs_dquot_t *); extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **); extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 688f608b3668..a6dfb97490cc 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -892,7 +892,8 @@ xfs_qm_dqiter_bufs( while (blkcnt--) { error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, bno), - mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL); + mp->m_quotainfo->qi_dqchunklen, 0, &bp, + xfs_dquot_read_verify); if (error) break; -- cgit v1.2.1 From 4bb20a83a2a5ac4dcb62780c9950e47939956126 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:10 +1100 Subject: xfs: add verifier callback to directory read code Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_attr.c | 23 ++++++++++++----------- fs/xfs/xfs_attr_leaf.c | 18 +++++++++--------- fs/xfs/xfs_da_btree.c | 44 ++++++++++++++++++++++++++++---------------- fs/xfs/xfs_da_btree.h | 7 ++++--- fs/xfs/xfs_dir2_block.c | 23 ++++++++++++----------- fs/xfs/xfs_dir2_leaf.c | 33 ++++++++++++++++----------------- fs/xfs/xfs_dir2_node.c | 43 ++++++++++++++++++++----------------------- fs/xfs/xfs_file.c | 2 +- 8 files changed, 102 insertions(+), 91 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 474c57a43cce..cd5a9cd0ded0 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -904,7 +904,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) dp = args->dp; args->blkno = 0; error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK); + XFS_ATTR_FORK, NULL); if (error) return(error); ASSERT(bp != NULL); @@ -1032,7 +1032,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * remove the "old" attr from that block (neat, huh!) 
*/ error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, - &bp, XFS_ATTR_FORK); + &bp, XFS_ATTR_FORK, NULL); if (error) return(error); ASSERT(bp != NULL); @@ -1101,7 +1101,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) dp = args->dp; args->blkno = 0; error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK); + XFS_ATTR_FORK, NULL); if (error) { return(error); } @@ -1159,7 +1159,7 @@ xfs_attr_leaf_get(xfs_da_args_t *args) args->blkno = 0; error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK); + XFS_ATTR_FORK, NULL); if (error) return(error); ASSERT(bp != NULL); @@ -1190,7 +1190,8 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context) trace_xfs_attr_leaf_list(context); context->cursor->blkno = 0; - error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK); + error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK, + NULL); if (error) return XFS_ERROR(error); ASSERT(bp != NULL); @@ -1605,7 +1606,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) state->path.blk[0].bp = NULL; error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp, - XFS_ATTR_FORK); + XFS_ATTR_FORK, NULL); if (error) goto out; ASSERT((((xfs_attr_leafblock_t *)bp->b_addr)->hdr.info.magic) == @@ -1718,7 +1719,7 @@ xfs_attr_refillstate(xfs_da_state_t *state) error = xfs_da_read_buf(state->args->trans, state->args->dp, blk->blkno, blk->disk_blkno, - &blk->bp, XFS_ATTR_FORK); + &blk->bp, XFS_ATTR_FORK, NULL); if (error) return(error); } else { @@ -1737,7 +1738,7 @@ xfs_attr_refillstate(xfs_da_state_t *state) error = xfs_da_read_buf(state->args->trans, state->args->dp, blk->blkno, blk->disk_blkno, - &blk->bp, XFS_ATTR_FORK); + &blk->bp, XFS_ATTR_FORK, NULL); if (error) return(error); } else { @@ -1827,7 +1828,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) bp = NULL; if (cursor->blkno > 0) { error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, - &bp, XFS_ATTR_FORK); + &bp, XFS_ATTR_FORK, NULL); if ((error != 0) && (error != EFSCORRUPTED)) return(error); if (bp) { @@ -1870,7 +1871,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) for (;;) { error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, &bp, - XFS_ATTR_FORK); + XFS_ATTR_FORK, NULL); if (error) return(error); if (unlikely(bp == NULL)) { @@ -1937,7 +1938,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) cursor->blkno = be32_to_cpu(leaf->hdr.info.forw); xfs_trans_brelse(NULL, bp); error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, - &bp, XFS_ATTR_FORK); + &bp, XFS_ATTR_FORK, NULL); if (error) return(error); if (unlikely((bp == NULL))) { diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 4bfc732bc9c9..ba2b9a2cd236 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -871,7 +871,7 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args) if (error) goto out; error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1, - XFS_ATTR_FORK); + XFS_ATTR_FORK, NULL); if (error) goto out; ASSERT(bp1 != NULL); @@ -1642,7 +1642,7 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) if (blkno == 0) continue; error = xfs_da_read_buf(state->args->trans, state->args->dp, - blkno, -1, &bp, XFS_ATTR_FORK); + blkno, -1, &bp, XFS_ATTR_FORK, NULL); if (error) return(error); ASSERT(bp != NULL); @@ -2519,7 +2519,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) * Set up the operation. 
*/ error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK); + XFS_ATTR_FORK, NULL); if (error) { return(error); } @@ -2584,7 +2584,7 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args) * Set up the operation. */ error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK); + XFS_ATTR_FORK, NULL); if (error) { return(error); } @@ -2641,7 +2641,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) * Read the block containing the "old" attr */ error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp1, - XFS_ATTR_FORK); + XFS_ATTR_FORK, NULL); if (error) { return(error); } @@ -2652,7 +2652,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) */ if (args->blkno2 != args->blkno) { error = xfs_da_read_buf(args->trans, args->dp, args->blkno2, - -1, &bp2, XFS_ATTR_FORK); + -1, &bp2, XFS_ATTR_FORK, NULL); if (error) { return(error); } @@ -2753,7 +2753,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) * the extents in reverse order the extent containing * block 0 must still be there. */ - error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); + error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK, NULL); if (error) return(error); blkno = XFS_BUF_ADDR(bp); @@ -2839,7 +2839,7 @@ xfs_attr_node_inactive( * before we come back to this one. */ error = xfs_da_read_buf(*trans, dp, child_fsb, -2, &child_bp, - XFS_ATTR_FORK); + XFS_ATTR_FORK, NULL); if (error) return(error); if (child_bp) { @@ -2880,7 +2880,7 @@ xfs_attr_node_inactive( */ if ((i+1) < count) { error = xfs_da_read_buf(*trans, dp, 0, parent_blkno, - &bp, XFS_ATTR_FORK); + &bp, XFS_ATTR_FORK, NULL); if (error) return(error); child_fsb = be32_to_cpu(node->btree[i+1].before); diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 4af8bad7068c..f9e9149de009 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -747,7 +747,7 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) child = be32_to_cpu(oldroot->btree[0].before); ASSERT(child != 0); error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp, - args->whichfork); + args->whichfork, NULL); if (error) return(error); ASSERT(bp != NULL); @@ -838,7 +838,8 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) if (blkno == 0) continue; error = xfs_da_read_buf(state->args->trans, state->args->dp, - blkno, -1, &bp, state->args->whichfork); + blkno, -1, &bp, state->args->whichfork, + NULL); if (error) return(error); ASSERT(bp != NULL); @@ -1084,7 +1085,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) */ blk->blkno = blkno; error = xfs_da_read_buf(args->trans, args->dp, blkno, - -1, &blk->bp, args->whichfork); + -1, &blk->bp, args->whichfork, NULL); if (error) { blk->blkno = 0; state->path.active--; @@ -1247,7 +1248,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, if (old_info->back) { error = xfs_da_read_buf(args->trans, args->dp, be32_to_cpu(old_info->back), - -1, &bp, args->whichfork); + -1, &bp, args->whichfork, NULL); if (error) return(error); ASSERT(bp != NULL); @@ -1268,7 +1269,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, if (old_info->forw) { error = xfs_da_read_buf(args->trans, args->dp, be32_to_cpu(old_info->forw), - -1, &bp, args->whichfork); + -1, &bp, args->whichfork, NULL); if (error) return(error); ASSERT(bp != NULL); @@ -1368,7 +1369,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, if (drop_info->back) { error = xfs_da_read_buf(args->trans, args->dp, 
be32_to_cpu(drop_info->back), - -1, &bp, args->whichfork); + -1, &bp, args->whichfork, NULL); if (error) return(error); ASSERT(bp != NULL); @@ -1385,7 +1386,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, if (drop_info->forw) { error = xfs_da_read_buf(args->trans, args->dp, be32_to_cpu(drop_info->forw), - -1, &bp, args->whichfork); + -1, &bp, args->whichfork, NULL); if (error) return(error); ASSERT(bp != NULL); @@ -1470,7 +1471,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, */ blk->blkno = blkno; error = xfs_da_read_buf(args->trans, args->dp, blkno, -1, - &blk->bp, args->whichfork); + &blk->bp, args->whichfork, NULL); if (error) return(error); ASSERT(blk->bp != NULL); @@ -1733,7 +1734,8 @@ xfs_da_swap_lastblock( * Read the last block in the btree space. */ last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs; - if ((error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w))) + error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w, NULL); + if (error) return error; /* * Copy the last block into the dead buffer and log it. @@ -1759,7 +1761,9 @@ xfs_da_swap_lastblock( * If the moved block has a left sibling, fix up the pointers. */ if ((sib_blkno = be32_to_cpu(dead_info->back))) { - if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) + error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w, + NULL); + if (error) goto done; sib_info = sib_buf->b_addr; if (unlikely( @@ -1780,7 +1784,9 @@ xfs_da_swap_lastblock( * If the moved block has a right sibling, fix up the pointers. */ if ((sib_blkno = be32_to_cpu(dead_info->forw))) { - if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) + error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w, + NULL); + if (error) goto done; sib_info = sib_buf->b_addr; if (unlikely( @@ -1803,7 +1809,9 @@ xfs_da_swap_lastblock( * Walk down the tree looking for the parent of the moved block. 
*/ for (;;) { - if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) + error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w, + NULL); + if (error) goto done; par_node = par_buf->b_addr; if (unlikely(par_node->hdr.info.magic != @@ -1853,7 +1861,9 @@ xfs_da_swap_lastblock( error = XFS_ERROR(EFSCORRUPTED); goto done; } - if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) + error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w, + NULL); + if (error) goto done; par_node = par_buf->b_addr; if (unlikely( @@ -2139,7 +2149,8 @@ xfs_da_read_buf( xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp, - int whichfork) + int whichfork, + xfs_buf_iodone_t verifier) { struct xfs_buf *bp; struct xfs_buf_map map; @@ -2161,7 +2172,7 @@ xfs_da_read_buf( error = xfs_trans_read_buf_map(dp->i_mount, trans, dp->i_mount->m_ddev_targp, - mapp, nmap, 0, &bp, NULL); + mapp, nmap, 0, &bp, verifier); if (error) goto out_free; @@ -2217,7 +2228,8 @@ xfs_da_reada_buf( struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, - int whichfork) + int whichfork, + xfs_buf_iodone_t verifier) { xfs_daddr_t mappedbno = -1; struct xfs_buf_map map; diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 132adafb041e..bf8bfaa0d356 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -18,7 +18,6 @@ #ifndef __XFS_DA_BTREE_H__ #define __XFS_DA_BTREE_H__ -struct xfs_buf; struct xfs_bmap_free; struct xfs_inode; struct xfs_mount; @@ -226,9 +225,11 @@ int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp, struct xfs_buf **bp, int whichfork); int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, - struct xfs_buf **bpp, int whichfork); + struct xfs_buf **bpp, int whichfork, + xfs_buf_iodone_t verifier); xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp, - xfs_dablk_t bno, int whichfork); + xfs_dablk_t bno, int whichfork, + xfs_buf_iodone_t verifier); int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, struct xfs_buf *dead_buf); diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index e93ca8f054f4..53666ca6c953 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -97,10 +97,10 @@ xfs_dir2_block_addname( /* * Read the (one and only) directory block into dabuf bp. */ - if ((error = - xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) { + error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, + XFS_DATA_FORK, NULL); + if (error) return error; - } ASSERT(bp != NULL); hdr = bp->b_addr; /* @@ -457,7 +457,7 @@ xfs_dir2_block_getdents( * Can't read the block, give up, else get dabuf in bp. */ error = xfs_da_read_buf(NULL, dp, mp->m_dirdatablk, -1, - &bp, XFS_DATA_FORK); + &bp, XFS_DATA_FORK, NULL); if (error) return error; @@ -640,10 +640,10 @@ xfs_dir2_block_lookup_int( /* * Read the buffer, return error if we can't get it. */ - if ((error = - xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) { + error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, + XFS_DATA_FORK, NULL); + if (error) return error; - } ASSERT(bp != NULL); hdr = bp->b_addr; xfs_dir2_data_check(dp, bp); @@ -917,10 +917,11 @@ xfs_dir2_leaf_to_block( /* * Read the data block if we don't already have it, give up if it fails. 
*/ - if (dbp == NULL && - (error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp, - XFS_DATA_FORK))) { - return error; + if (!dbp) { + error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp, + XFS_DATA_FORK, NULL); + if (error) + return error; } hdr = dbp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index bac86984e403..86e3dc1de0e7 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -315,10 +315,9 @@ xfs_dir2_leaf_addname( * Read the leaf block. */ error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, - XFS_DATA_FORK); - if (error) { + XFS_DATA_FORK, NULL); + if (error) return error; - } ASSERT(lbp != NULL); /* * Look up the entry by hash value and name. @@ -500,9 +499,9 @@ xfs_dir2_leaf_addname( * Just read that one in. */ else { - if ((error = - xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block), - -1, &dbp, XFS_DATA_FORK))) { + error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block), + -1, &dbp, XFS_DATA_FORK, NULL); + if (error) { xfs_trans_brelse(tp, lbp); return error; } @@ -895,7 +894,7 @@ xfs_dir2_leaf_readbuf( error = xfs_da_read_buf(NULL, dp, map->br_startoff, map->br_blockcount >= mp->m_dirblkfsbs ? XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, - &bp, XFS_DATA_FORK); + &bp, XFS_DATA_FORK, NULL); /* * Should just skip over the data block instead of giving up. @@ -938,7 +937,7 @@ xfs_dir2_leaf_readbuf( xfs_da_reada_buf(NULL, dp, map[mip->ra_index].br_startoff + mip->ra_offset, - XFS_DATA_FORK); + XFS_DATA_FORK, NULL); mip->ra_current = i; } @@ -1376,7 +1375,7 @@ xfs_dir2_leaf_lookup_int( * Read the leaf block into the buffer. */ error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, - XFS_DATA_FORK); + XFS_DATA_FORK, NULL); if (error) return error; *lbpp = lbp; @@ -1411,7 +1410,7 @@ xfs_dir2_leaf_lookup_int( xfs_trans_brelse(tp, dbp); error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, newdb), - -1, &dbp, XFS_DATA_FORK); + -1, &dbp, XFS_DATA_FORK, NULL); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1453,7 +1452,7 @@ xfs_dir2_leaf_lookup_int( xfs_trans_brelse(tp, dbp); error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, cidb), - -1, &dbp, XFS_DATA_FORK); + -1, &dbp, XFS_DATA_FORK, NULL); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1738,10 +1737,10 @@ xfs_dir2_leaf_trim_data( /* * Read the offending data block. We need its buffer. */ - if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp, - XFS_DATA_FORK))) { + error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp, + XFS_DATA_FORK, NULL); + if (error) return error; - } leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(mp, leaf); @@ -1864,10 +1863,10 @@ xfs_dir2_node_to_leaf( /* * Read the freespace block. 
*/ - if ((error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp, - XFS_DATA_FORK))) { + error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp, + XFS_DATA_FORK, NULL); + if (error) return error; - } free = fbp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); ASSERT(!free->hdr.firstdb); diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 6c7052406605..290c2b1016ab 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -399,7 +399,7 @@ xfs_dir2_leafn_lookup_for_addname( */ error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, newfdb), - -1, &curbp, XFS_DATA_FORK); + -1, &curbp, XFS_DATA_FORK, NULL); if (error) return error; free = curbp->b_addr; @@ -536,7 +536,7 @@ xfs_dir2_leafn_lookup_for_entry( } else { error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, newdb), - -1, &curbp, XFS_DATA_FORK); + -1, &curbp, XFS_DATA_FORK, NULL); if (error) return error; } @@ -915,10 +915,10 @@ xfs_dir2_leafn_remove( * read in the free block. */ fdb = xfs_dir2_db_to_fdb(mp, db); - if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), - -1, &fbp, XFS_DATA_FORK))) { + error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), + -1, &fbp, XFS_DATA_FORK, NULL); + if (error) return error; - } free = fbp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); ASSERT(be32_to_cpu(free->hdr.firstdb) == @@ -1169,11 +1169,10 @@ xfs_dir2_leafn_toosmall( /* * Read the sibling leaf block. */ - if ((error = - xfs_da_read_buf(state->args->trans, state->args->dp, blkno, - -1, &bp, XFS_DATA_FORK))) { + error = xfs_da_read_buf(state->args->trans, state->args->dp, + blkno, -1, &bp, XFS_DATA_FORK, NULL); + if (error) return error; - } ASSERT(bp != NULL); /* * Count bytes in the two blocks combined. @@ -1454,14 +1453,13 @@ xfs_dir2_node_addname_int( * This should be really rare, so there's no reason * to avoid it. */ - if ((error = xfs_da_read_buf(tp, dp, - xfs_dir2_db_to_da(mp, fbno), -2, &fbp, - XFS_DATA_FORK))) { + error = xfs_da_read_buf(tp, dp, + xfs_dir2_db_to_da(mp, fbno), -2, + &fbp, XFS_DATA_FORK, NULL); + if (error) return error; - } - if (unlikely(fbp == NULL)) { + if (!fbp) continue; - } free = fbp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); findex = 0; @@ -1520,9 +1518,9 @@ xfs_dir2_node_addname_int( * that was just allocated. */ fbno = xfs_dir2_db_to_fdb(mp, dbno); - if (unlikely(error = xfs_da_read_buf(tp, dp, - xfs_dir2_db_to_da(mp, fbno), -2, &fbp, - XFS_DATA_FORK))) + error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fbno), -2, + &fbp, XFS_DATA_FORK, NULL); + if (error) return error; /* @@ -1631,7 +1629,7 @@ xfs_dir2_node_addname_int( * Read the data block in. */ error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, dbno), - -1, &dbp, XFS_DATA_FORK); + -1, &dbp, XFS_DATA_FORK, NULL); if (error) return error; hdr = dbp->b_addr; @@ -1917,11 +1915,10 @@ xfs_dir2_node_trim_free( /* * Read the freespace block. */ - if (unlikely(error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp, - XFS_DATA_FORK))) { + error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp, + XFS_DATA_FORK, NULL); + if (error) return error; - } - /* * There can be holes in freespace. If fo is a hole, there's * nothing to do. 
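For reference while reading these conversions: this patch only threads a NULL verifier through xfs_da_read_buf() and xfs_da_reada_buf(); the patches that follow supply real callbacks. The read verifiers introduced earlier in the series all share the same shape, roughly sketched below (xfs_foo_read_verify() and xfs_foo_block_ok() are illustrative names, not functions from any patch in this series):

static void
xfs_foo_read_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_target->bt_mount;

	/* format-specific checks: magic number, levels, pointer bounds */
	if (!xfs_foo_block_ok(mp, bp->b_addr)) {
		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
				     bp->b_addr);
		/* the caller sees the corruption as an I/O error */
		xfs_buf_ioerror(bp, EFSCORRUPTED);
	}

	/* the verifier is a one-shot b_iodone; restore normal completion */
	bp->b_iodone = NULL;
	xfs_buf_ioend(bp, 0);
}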
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index c42f99e71f14..f6dab7da7bcc 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -891,7 +891,7 @@ xfs_dir_open( */ mode = xfs_ilock_map_shared(ip); if (ip->i_d.di_nextents > 0) - xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK); + xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK, NULL); xfs_iunlock(ip, mode); return 0; } -- cgit v1.2.1 From 20f7e9f3726a27cccade65c28265eef8ca50eecb Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:11 +1100 Subject: xfs: factor dir2 block read operations In preparation for verifying dir2 block format buffers, factor the read operations out of the block operations (lookup, addname, getdents) and some of the additional logic to make it easier to understand and modify the code. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_dir2_block.c | 386 ++++++++++++++++++++++++++---------------------- 1 file changed, 209 insertions(+), 177 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 53666ca6c953..25ce409487be 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -56,6 +56,178 @@ xfs_dir_startup(void) xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2); } +static int +xfs_dir2_block_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = dp->i_mount; + + return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp, + XFS_DATA_FORK, NULL); +} + +static void +xfs_dir2_block_need_space( + struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_block_tail *btp, + struct xfs_dir2_leaf_entry *blp, + __be16 **tagpp, + struct xfs_dir2_data_unused **dupp, + struct xfs_dir2_data_unused **enddupp, + int *compact, + int len) +{ + struct xfs_dir2_data_free *bf; + __be16 *tagp = NULL; + struct xfs_dir2_data_unused *dup = NULL; + struct xfs_dir2_data_unused *enddup = NULL; + + *compact = 0; + bf = hdr->bestfree; + + /* + * If there are stale entries we'll use one for the leaf. + */ + if (btp->stale) { + if (be16_to_cpu(bf[0].length) >= len) { + /* + * The biggest entry enough to avoid compaction. + */ + dup = (xfs_dir2_data_unused_t *) + ((char *)hdr + be16_to_cpu(bf[0].offset)); + goto out; + } + + /* + * Will need to compact to make this work. + * Tag just before the first leaf entry. + */ + *compact = 1; + tagp = (__be16 *)blp - 1; + + /* Data object just before the first leaf entry. */ + dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); + + /* + * If it's not free then the data will go where the + * leaf data starts now, if it works at all. + */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) * + (uint)sizeof(*blp) < len) + dup = NULL; + } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len) + dup = NULL; + else + dup = (xfs_dir2_data_unused_t *)blp; + goto out; + } + + /* + * no stale entries, so just use free space. + * Tag just before the first leaf entry. + */ + tagp = (__be16 *)blp - 1; + + /* Data object just before the first leaf entry. */ + enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); + + /* + * If it's not free then can't do this add without cleaning up: + * the space before the first leaf entry needs to be free so it + * can be expanded to hold the pointer to the new entry.
+ */ + if (be16_to_cpu(enddup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + /* + * Check out the biggest freespace and see if it's the same one. + */ + dup = (xfs_dir2_data_unused_t *) + ((char *)hdr + be16_to_cpu(bf[0].offset)); + if (dup != enddup) { + /* + * Not the same free entry, just check its length. + */ + if (be16_to_cpu(dup->length) < len) + dup = NULL; + goto out; + } + + /* + * It is the biggest freespace, can it hold the leaf too? + */ + if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) { + /* + * Yes, use the second-largest entry instead if it works. + */ + if (be16_to_cpu(bf[1].length) >= len) + dup = (xfs_dir2_data_unused_t *) + ((char *)hdr + be16_to_cpu(bf[1].offset)); + else + dup = NULL; + } + } +out: + *tagpp = tagp; + *dupp = dup; + *enddupp = enddup; +} + +/* + * compact the leaf entries. + * Leave the highest-numbered stale entry stale. + * XXX should be the one closest to mid but mid is not yet computed. + */ +static void +xfs_dir2_block_compact( + struct xfs_trans *tp, + struct xfs_buf *bp, + struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_block_tail *btp, + struct xfs_dir2_leaf_entry *blp, + int *needlog, + int *lfloghigh, + int *lfloglow) +{ + int fromidx; /* source leaf index */ + int toidx; /* target leaf index */ + int needscan = 0; + int highstale; /* high stale index */ + + fromidx = toidx = be32_to_cpu(btp->count) - 1; + highstale = *lfloghigh = -1; + for (; fromidx >= 0; fromidx--) { + if (blp[fromidx].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) { + if (highstale == -1) + highstale = toidx; + else { + if (*lfloghigh == -1) + *lfloghigh = toidx; + continue; + } + } + if (fromidx < toidx) + blp[toidx] = blp[fromidx]; + toidx--; + } + *lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1); + *lfloghigh -= be32_to_cpu(btp->stale) - 1; + be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1)); + xfs_dir2_data_make_free(tp, bp, + (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr), + (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)), + needlog, &needscan); + blp += be32_to_cpu(btp->stale) - 1; + btp->stale = cpu_to_be32(1); + /* + * If we now need to rebuild the bestfree map, do so. + * This needs to happen before the next call to use_free. + */ + if (needscan) + xfs_dir2_data_freescan(tp->t_mountp, hdr, needlog); +} + /* * Add an entry to a block directory. */ @@ -63,7 +235,6 @@ int /* error */ xfs_dir2_block_addname( xfs_da_args_t *args) /* directory op arguments */ { - xfs_dir2_data_free_t *bf; /* bestfree table in block */ xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ struct xfs_buf *bp; /* buffer for block */ @@ -94,134 +265,44 @@ xfs_dir2_block_addname( dp = args->dp; tp = args->trans; mp = dp->i_mount; - /* - * Read the (one and only) directory block into dabuf bp. - */ - error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, - XFS_DATA_FORK, NULL); + + /* Read the (one and only) directory block into bp. */ + error = xfs_dir2_block_read(tp, dp, &bp); if (error) return error; - ASSERT(bp != NULL); - hdr = bp->b_addr; - /* - * Check the magic number, corrupted if wrong. - */ - if (unlikely(hdr->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))) { - XFS_CORRUPTION_ERROR("xfs_dir2_block_addname", - XFS_ERRLEVEL_LOW, mp, hdr); - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EFSCORRUPTED); - } + len = xfs_dir2_data_entsize(args->namelen); + /* * Set up pointers to parts of the block. 
*/ - bf = hdr->bestfree; + hdr = bp->b_addr; btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); + /* - * No stale entries? Need space for entry and new leaf. - */ - if (!btp->stale) { - /* - * Tag just before the first leaf entry. - */ - tagp = (__be16 *)blp - 1; - /* - * Data object just before the first leaf entry. - */ - enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); - /* - * If it's not free then can't do this add without cleaning up: - * the space before the first leaf entry needs to be free so it - * can be expanded to hold the pointer to the new entry. - */ - if (be16_to_cpu(enddup->freetag) != XFS_DIR2_DATA_FREE_TAG) - dup = enddup = NULL; - /* - * Check out the biggest freespace and see if it's the same one. - */ - else { - dup = (xfs_dir2_data_unused_t *) - ((char *)hdr + be16_to_cpu(bf[0].offset)); - if (dup == enddup) { - /* - * It is the biggest freespace, is it too small - * to hold the new leaf too? - */ - if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) { - /* - * Yes, we use the second-largest - * entry instead if it works. - */ - if (be16_to_cpu(bf[1].length) >= len) - dup = (xfs_dir2_data_unused_t *) - ((char *)hdr + - be16_to_cpu(bf[1].offset)); - else - dup = NULL; - } - } else { - /* - * Not the same free entry, - * just check its length. - */ - if (be16_to_cpu(dup->length) < len) { - dup = NULL; - } - } - } - compact = 0; - } - /* - * If there are stale entries we'll use one for the leaf. - * Is the biggest entry enough to avoid compaction? - */ - else if (be16_to_cpu(bf[0].length) >= len) { - dup = (xfs_dir2_data_unused_t *) - ((char *)hdr + be16_to_cpu(bf[0].offset)); - compact = 0; - } - /* - * Will need to compact to make this work. + * Find out if we can reuse stale entries or whether we need extra + * space for entry and new leaf. */ - else { - /* - * Tag just before the first leaf entry. - */ - tagp = (__be16 *)blp - 1; - /* - * Data object just before the first leaf entry. - */ - dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); - /* - * If it's not free then the data will go where the - * leaf data starts now, if it works at all. - */ - if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) * - (uint)sizeof(*blp) < len) - dup = NULL; - } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len) - dup = NULL; - else - dup = (xfs_dir2_data_unused_t *)blp; - compact = 1; - } + xfs_dir2_block_need_space(hdr, btp, blp, &tagp, &dup, + &enddup, &compact, len); + /* - * If this isn't a real add, we're done with the buffer. + * Done everything we need for a space check now. */ - if (args->op_flags & XFS_DA_OP_JUSTCHECK) + if (args->op_flags & XFS_DA_OP_JUSTCHECK) { xfs_trans_brelse(tp, bp); + if (!dup) + return XFS_ERROR(ENOSPC); + return 0; + } + /* * If we don't have space for the new entry & leaf ... */ if (!dup) { - /* - * Not trying to actually do anything, or don't have - * a space reservation: return no-space. - */ - if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) + /* Don't have a space reservation: return no-space. */ + if (args->total == 0) return XFS_ERROR(ENOSPC); /* * Convert to the next larger format. @@ -232,65 +313,24 @@ xfs_dir2_block_addname( return error; return xfs_dir2_leaf_addname(args); } - /* - * Just checking, and it would work, so say so. - */ - if (args->op_flags & XFS_DA_OP_JUSTCHECK) - return 0; + needlog = needscan = 0; + /* * If need to compact the leaf entries, do it now. 
- * Leave the highest-numbered stale entry stale. - * XXX should be the one closest to mid but mid is not yet computed. - */ - if (compact) { - int fromidx; /* source leaf index */ - int toidx; /* target leaf index */ - - for (fromidx = toidx = be32_to_cpu(btp->count) - 1, - highstale = lfloghigh = -1; - fromidx >= 0; - fromidx--) { - if (blp[fromidx].address == - cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) { - if (highstale == -1) - highstale = toidx; - else { - if (lfloghigh == -1) - lfloghigh = toidx; - continue; - } - } - if (fromidx < toidx) - blp[toidx] = blp[fromidx]; - toidx--; - } - lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1); - lfloghigh -= be32_to_cpu(btp->stale) - 1; - be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1)); - xfs_dir2_data_make_free(tp, bp, - (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr), - (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)), - &needlog, &needscan); - blp += be32_to_cpu(btp->stale) - 1; - btp->stale = cpu_to_be32(1); - /* - * If we now need to rebuild the bestfree map, do so. - * This needs to happen before the next call to use_free. - */ - if (needscan) { - xfs_dir2_data_freescan(mp, hdr, &needlog); - needscan = 0; - } - } - /* - * Set leaf logging boundaries to impossible state. - * For the no-stale case they're set explicitly. */ + if (compact) + xfs_dir2_block_compact(tp, bp, hdr, btp, blp, &needlog, + &lfloghigh, &lfloglow); else if (btp->stale) { + /* + * Set leaf logging boundaries to impossible state. + * For the no-stale case they're set explicitly. + */ lfloglow = be32_to_cpu(btp->count); lfloghigh = -1; } + /* * Find the slot that's first lower than our hash value, -1 if none. */ @@ -450,18 +490,13 @@ xfs_dir2_block_getdents( /* * If the block number in the offset is out of range, we're done. */ - if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) { + if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) return 0; - } - /* - * Can't read the block, give up, else get dabuf in bp. - */ - error = xfs_da_read_buf(NULL, dp, mp->m_dirdatablk, -1, - &bp, XFS_DATA_FORK, NULL); + + error = xfs_dir2_block_read(NULL, dp, &bp); if (error) return error; - ASSERT(bp != NULL); /* * Extract the byte offset we start at from the seek pointer. * We'll skip entries before this. @@ -637,14 +672,11 @@ xfs_dir2_block_lookup_int( dp = args->dp; tp = args->trans; mp = dp->i_mount; - /* - * Read the buffer, return error if we can't get it. - */ - error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, - XFS_DATA_FORK, NULL); + + error = xfs_dir2_block_read(tp, dp, &bp); if (error) return error; - ASSERT(bp != NULL); + hdr = bp->b_addr; xfs_dir2_data_check(dp, bp); btp = xfs_dir2_block_tail_p(mp, hdr); -- cgit v1.2.1 From 82025d7f79148fe66a1594a0ebe4ab38152cf9e6 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:12 +1100 Subject: xfs: verify dir2 block format buffers Add a dir2 block format read verifier. To fully verify every block when read, call xfs_dir2_data_check() on them. Change xfs_dir2_data_check() to do runtime checking, convert ASSERT() checks to XFS_WANT_CORRUPTED_RETURN(), which will trigger an ASSERT failure on debug kernels, but on production kernels will dump an error to dmesg and return EFSCORRUPTED to the caller. 
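Both behaviours come from the macro itself; paraphrased (a sketch of its effect, not the definition copied from the tree), XFS_WANT_CORRUPTED_RETURN() expands to something like:

#define XFS_WANT_CORRUPTED_RETURN(expr)	\
	{	\
		int fs_is_ok = (expr);	\
		ASSERT(fs_is_ok);	/* debug kernels: assert failure */	\
		if (unlikely(!fs_is_ok)) {	\
			/* production kernels: log and error out */	\
			XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN",	\
					 XFS_ERRLEVEL_LOW, NULL);	\
			return XFS_ERROR(EFSCORRUPTED);	\
		}	\
	}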
Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_dir2_block.c | 22 ++++++++++++++- fs/xfs/xfs_dir2_data.c | 73 +++++++++++++++++++++++++++++-------------------- fs/xfs/xfs_dir2_priv.h | 4 ++- 3 files changed, 68 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 25ce409487be..57351b868861 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -56,6 +56,26 @@ xfs_dir_startup(void) xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2); } +static void +xfs_dir2_block_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_data_hdr *hdr = bp->b_addr; + int block_ok = 0; + + block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); + block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0; + + if (!block_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + static int xfs_dir2_block_read( struct xfs_trans *tp, @@ -65,7 +85,7 @@ xfs_dir2_block_read( struct xfs_mount *mp = dp->i_mount; return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp, - XFS_DATA_FORK, NULL); + XFS_DATA_FORK, xfs_dir2_block_verify); } static void diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index 44ffd4d6bc91..cb117234e32e 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -34,14 +34,13 @@ STATIC xfs_dir2_data_free_t * xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup); -#ifdef DEBUG /* * Check the consistency of the data block. * The input can also be a block-format directory. - * Pop an assert if we find anything bad. + * Return 0 is the buffer is good, otherwise an error. */ -void -xfs_dir2_data_check( +int +__xfs_dir2_data_check( struct xfs_inode *dp, /* incore inode pointer */ struct xfs_buf *bp) /* data block's buffer */ { @@ -64,18 +63,23 @@ xfs_dir2_data_check( int stale; /* count of stale leaves */ struct xfs_name name; - mp = dp->i_mount; + mp = bp->b_target->bt_mount; hdr = bp->b_addr; bf = hdr->bestfree; p = (char *)(hdr + 1); - if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { + switch (hdr->magic) { + case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): btp = xfs_dir2_block_tail_p(mp, hdr); lep = xfs_dir2_block_leaf_p(btp); endp = (char *)lep; - } else { - ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); + break; + case cpu_to_be32(XFS_DIR2_DATA_MAGIC): endp = (char *)hdr + mp->m_dirblksize; + break; + default: + XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp); + return EFSCORRUPTED; } count = lastfree = freeseen = 0; @@ -83,19 +87,22 @@ xfs_dir2_data_check( * Account for zero bestfree entries. */ if (!bf[0].length) { - ASSERT(!bf[0].offset); + XFS_WANT_CORRUPTED_RETURN(!bf[0].offset); freeseen |= 1 << 0; } if (!bf[1].length) { - ASSERT(!bf[1].offset); + XFS_WANT_CORRUPTED_RETURN(!bf[1].offset); freeseen |= 1 << 1; } if (!bf[2].length) { - ASSERT(!bf[2].offset); + XFS_WANT_CORRUPTED_RETURN(!bf[2].offset); freeseen |= 1 << 2; } - ASSERT(be16_to_cpu(bf[0].length) >= be16_to_cpu(bf[1].length)); - ASSERT(be16_to_cpu(bf[1].length) >= be16_to_cpu(bf[2].length)); + + XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >= + be16_to_cpu(bf[1].length)); + XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >= + be16_to_cpu(bf[2].length)); /* * Loop over the data/unused entries. */ @@ -107,17 +114,20 @@ xfs_dir2_data_check( * doesn't need to be there. 
*/ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - ASSERT(lastfree == 0); - ASSERT(be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == - (char *)dup - (char *)hdr); + XFS_WANT_CORRUPTED_RETURN(lastfree == 0); + XFS_WANT_CORRUPTED_RETURN( + be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == + (char *)dup - (char *)hdr); dfp = xfs_dir2_data_freefind(hdr, dup); if (dfp) { i = (int)(dfp - bf); - ASSERT((freeseen & (1 << i)) == 0); + XFS_WANT_CORRUPTED_RETURN( + (freeseen & (1 << i)) == 0); freeseen |= 1 << i; } else { - ASSERT(be16_to_cpu(dup->length) <= - be16_to_cpu(bf[2].length)); + XFS_WANT_CORRUPTED_RETURN( + be16_to_cpu(dup->length) <= + be16_to_cpu(bf[2].length)); } p += be16_to_cpu(dup->length); lastfree = 1; @@ -130,10 +140,12 @@ xfs_dir2_data_check( * The linear search is crude but this is DEBUG code. */ dep = (xfs_dir2_data_entry_t *)p; - ASSERT(dep->namelen != 0); - ASSERT(xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)) == 0); - ASSERT(be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) == - (char *)dep - (char *)hdr); + XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0); + XFS_WANT_CORRUPTED_RETURN( + !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))); + XFS_WANT_CORRUPTED_RETURN( + be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) == + (char *)dep - (char *)hdr); count++; lastfree = 0; if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { @@ -148,27 +160,30 @@ xfs_dir2_data_check( be32_to_cpu(lep[i].hashval) == hash) break; } - ASSERT(i < be32_to_cpu(btp->count)); + XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count)); } p += xfs_dir2_data_entsize(dep->namelen); } /* * Need to have seen all the entries and all the bestfree slots. */ - ASSERT(freeseen == 7); + XFS_WANT_CORRUPTED_RETURN(freeseen == 7); if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { for (i = stale = 0; i < be32_to_cpu(btp->count); i++) { if (lep[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; if (i > 0) - ASSERT(be32_to_cpu(lep[i].hashval) >= be32_to_cpu(lep[i - 1].hashval)); + XFS_WANT_CORRUPTED_RETURN( + be32_to_cpu(lep[i].hashval) >= + be32_to_cpu(lep[i - 1].hashval)); } - ASSERT(count == be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); - ASSERT(stale == be32_to_cpu(btp->stale)); + XFS_WANT_CORRUPTED_RETURN(count == + be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); + XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale)); } + return 0; } -#endif /* * Given a data block and an unused entry from that block, diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 3523d3e15aa8..93b8f66ae788 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -41,10 +41,12 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, /* xfs_dir2_data.c */ #ifdef DEBUG -extern void xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); +#define xfs_dir2_data_check(dp,bp) __xfs_dir2_data_check(dp, bp); #else #define xfs_dir2_data_check(dp,bp) #endif +extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); + extern struct xfs_dir2_data_free * xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_unused *dup, int *loghead); -- cgit v1.2.1 From 2025207ca6738a1217126ef14af9d104433f9824 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:13 +1100 Subject: xfs: factor dir2 free block reading Also factor out the updating of the free block when removing entries from leaf blocks, and add a verifier callback for reads. 
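The new read helpers differ only in the mappedbno they pass down: xfs_dir2_free_read() uses -1 and requires the block to exist, while xfs_dir2_free_try_read() uses -2, which lets a read that lands in a hole succeed with a NULL buffer instead of an error. A caller-side sketch of the try-read pattern, mirroring the xfs_dir2_node_trim_free() conversion below:

	struct xfs_buf	*bp;
	int		error;

	error = xfs_dir2_free_try_read(tp, dp, fo, &bp);
	if (error)
		return error;
	if (!bp)
		return 0;	/* hole in the freespace: nothing to do */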
Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_dir2_leaf.c | 3 +- fs/xfs/xfs_dir2_node.c | 218 +++++++++++++++++++++++++++++++------------------ fs/xfs/xfs_dir2_priv.h | 2 + 3 files changed, 143 insertions(+), 80 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 86e3dc1de0e7..6c1359dc9898 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -1863,8 +1863,7 @@ xfs_dir2_node_to_leaf( /* * Read the freespace block. */ - error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp, - XFS_DATA_FORK, NULL); + error = xfs_dir2_free_read(tp, dp, mp->m_dirfreeblk, &fbp); if (error) return error; free = fbp->b_addr; diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 290c2b1016ab..d7f899dfbff5 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -55,6 +55,57 @@ static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp, static int xfs_dir2_node_addname_int(xfs_da_args_t *args, xfs_da_state_blk_t *fblk); +static void +xfs_dir2_free_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_free_hdr *hdr = bp->b_addr; + int block_ok = 0; + + block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC); + if (!block_ok) { + XFS_CORRUPTION_ERROR("xfs_dir2_free_verify magic", + XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + +static int +__xfs_dir2_free_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, xfs_dir2_free_verify); +} + +int +xfs_dir2_free_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + struct xfs_buf **bpp) +{ + return __xfs_dir2_free_read(tp, dp, fbno, -1, bpp); +} + +static int +xfs_dir2_free_try_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + struct xfs_buf **bpp) +{ + return __xfs_dir2_free_read(tp, dp, fbno, -2, bpp); +} + /* * Log entries from a freespace block. */ @@ -394,12 +445,10 @@ xfs_dir2_leafn_lookup_for_addname( */ if (curbp) xfs_trans_brelse(tp, curbp); - /* - * Read the free block. - */ - error = xfs_da_read_buf(tp, dp, + + error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(mp, newfdb), - -1, &curbp, XFS_DATA_FORK, NULL); + &curbp); if (error) return error; free = curbp->b_addr; @@ -825,6 +874,77 @@ xfs_dir2_leafn_rebalance( } } +static int +xfs_dir2_data_block_free( + xfs_da_args_t *args, + struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_free *free, + xfs_dir2_db_t fdb, + int findex, + struct xfs_buf *fbp, + int longest) +{ + struct xfs_trans *tp = args->trans; + int logfree = 0; + + if (!hdr) { + /* One less used entry in the free table. */ + be32_add_cpu(&free->hdr.nused, -1); + xfs_dir2_free_log_header(tp, fbp); + + /* + * If this was the last entry in the table, we can trim the + * table size back. There might be other entries at the end + * referring to non-existent data blocks, get those too. + */ + if (findex == be32_to_cpu(free->hdr.nvalid) - 1) { + int i; /* free entry index */ + + for (i = findex - 1; i >= 0; i--) { + if (free->bests[i] != cpu_to_be16(NULLDATAOFF)) + break; + } + free->hdr.nvalid = cpu_to_be32(i + 1); + logfree = 0; + } else { + /* Not the last entry, just punch it out. 
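+		 * (NULLDATAOFF is the sentinel for a table slot whose
+		 * data block no longer exists; the trailing-entry trim
+		 * above keys off the same value.)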
*/ + free->bests[findex] = cpu_to_be16(NULLDATAOFF); + logfree = 1; + } + /* + * If there are no useful entries left in the block, + * get rid of the block if we can. + */ + if (!free->hdr.nused) { + int error; + + error = xfs_dir2_shrink_inode(args, fdb, fbp); + if (error == 0) { + fbp = NULL; + logfree = 0; + } else if (error != ENOSPC || args->total != 0) + return error; + /* + * It's possible to get ENOSPC if there is no + * space reservation. In this case some one + * else will eventually get rid of this block. + */ + } + } else { + /* + * Data block is not empty, just set the free entry to the new + * value. + */ + free->bests[findex] = cpu_to_be16(longest); + logfree = 1; + } + + /* Log the free entry that changed, unless we got rid of it. */ + if (logfree) + xfs_dir2_free_log_bests(tp, fbp, findex, findex); + return 0; +} + /* * Remove an entry from a node directory. * This removes the leaf entry and the data entry, @@ -908,15 +1028,14 @@ xfs_dir2_leafn_remove( xfs_dir2_db_t fdb; /* freeblock block number */ int findex; /* index in freeblock entries */ xfs_dir2_free_t *free; /* freeblock structure */ - int logfree; /* need to log free entry */ /* * Convert the data block number to a free block, * read in the free block. */ fdb = xfs_dir2_db_to_fdb(mp, db); - error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), - -1, &fbp, XFS_DATA_FORK, NULL); + error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(mp, fdb), + &fbp); if (error) return error; free = fbp->b_addr; @@ -954,68 +1073,12 @@ xfs_dir2_leafn_remove( * If we got rid of the data block, we can eliminate that entry * in the free block. */ - if (hdr == NULL) { - /* - * One less used entry in the free table. - */ - be32_add_cpu(&free->hdr.nused, -1); - xfs_dir2_free_log_header(tp, fbp); - /* - * If this was the last entry in the table, we can - * trim the table size back. There might be other - * entries at the end referring to non-existent - * data blocks, get those too. - */ - if (findex == be32_to_cpu(free->hdr.nvalid) - 1) { - int i; /* free entry index */ - - for (i = findex - 1; - i >= 0 && - free->bests[i] == cpu_to_be16(NULLDATAOFF); - i--) - continue; - free->hdr.nvalid = cpu_to_be32(i + 1); - logfree = 0; - } - /* - * Not the last entry, just punch it out. - */ - else { - free->bests[findex] = cpu_to_be16(NULLDATAOFF); - logfree = 1; - } - /* - * If there are no useful entries left in the block, - * get rid of the block if we can. - */ - if (!free->hdr.nused) { - error = xfs_dir2_shrink_inode(args, fdb, fbp); - if (error == 0) { - fbp = NULL; - logfree = 0; - } else if (error != ENOSPC || args->total != 0) - return error; - /* - * It's possible to get ENOSPC if there is no - * space reservation. In this case some one - * else will eventually get rid of this block. - */ - } - } - /* - * Data block is not empty, just set the free entry to - * the new value. - */ - else { - free->bests[findex] = cpu_to_be16(longest); - logfree = 1; - } - /* - * Log the free entry that changed, unless we got rid of it. - */ - if (logfree) - xfs_dir2_free_log_bests(tp, fbp, findex, findex); + error = xfs_dir2_data_block_free(args, hdr, free, + fdb, findex, fbp, longest); + if (error) + return error; } + xfs_dir2_leafn_check(dp, bp); /* * Return indication of whether this leaf block is empty enough @@ -1453,9 +1516,9 @@ xfs_dir2_node_addname_int( * This should be really rare, so there's no reason * to avoid it. 
*/ - error = xfs_da_read_buf(tp, dp, - xfs_dir2_db_to_da(mp, fbno), -2, - &fbp, XFS_DATA_FORK, NULL); + error = xfs_dir2_free_try_read(tp, dp, + xfs_dir2_db_to_da(mp, fbno), + &fbp); if (error) return error; if (!fbp) @@ -1518,8 +1581,9 @@ xfs_dir2_node_addname_int( * that was just allocated. */ fbno = xfs_dir2_db_to_fdb(mp, dbno); - error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fbno), -2, - &fbp, XFS_DATA_FORK, NULL); + error = xfs_dir2_free_try_read(tp, dp, + xfs_dir2_db_to_da(mp, fbno), + &fbp); if (error) return error; @@ -1915,17 +1979,15 @@ xfs_dir2_node_trim_free( /* * Read the freespace block. */ - error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp, - XFS_DATA_FORK, NULL); + error = xfs_dir2_free_try_read(tp, dp, fo, &bp); if (error) return error; /* * There can be holes in freespace. If fo is a hole, there's * nothing to do. */ - if (bp == NULL) { + if (!bp) return 0; - } free = bp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); /* diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 93b8f66ae788..263a63287910 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -117,6 +117,8 @@ extern int xfs_dir2_node_removename(struct xfs_da_args *args); extern int xfs_dir2_node_replace(struct xfs_da_args *args); extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo, int *rvalp); +extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t fbno, struct xfs_buf **bpp); /* xfs_dir2_sf.c */ extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp); -- cgit v1.2.1 From e4813572640e27d3a5cce3f06751a9f54f77aaa5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:14 +1100 Subject: xfs: factor out dir2 data block reading And add a verifier callback function while there. Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_dir2_block.c | 3 +-- fs/xfs/xfs_dir2_data.c | 32 ++++++++++++++++++++++++++++++++ fs/xfs/xfs_dir2_leaf.c | 38 +++++++++++++++++--------------------- fs/xfs/xfs_dir2_node.c | 8 ++++---- fs/xfs/xfs_dir2_priv.h | 2 ++ 5 files changed, 56 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 57351b868861..ca03b109772d 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -970,8 +970,7 @@ xfs_dir2_leaf_to_block( * Read the data block if we don't already have it, give up if it fails. 
*/ if (!dbp) { - error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp, - XFS_DATA_FORK, NULL); + error = xfs_dir2_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp); if (error) return error; } diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index cb117234e32e..0ef04f1bf511 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -185,6 +185,38 @@ __xfs_dir2_data_check( return 0; } +static void +xfs_dir2_data_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_data_hdr *hdr = bp->b_addr; + int block_ok = 0; + + block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC); + block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0; + + if (!block_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + +int +xfs_dir2_data_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mapped_bno, + struct xfs_buf **bpp) +{ + return xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, + XFS_DATA_FORK, xfs_dir2_data_verify); +} + /* * Given a data block and an unused entry from that block, * return the bestfree entry if any that corresponds to it. diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 6c1359dc9898..0fdf765c917f 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -493,14 +493,14 @@ xfs_dir2_leaf_addname( hdr = dbp->b_addr; bestsp[use_block] = hdr->bestfree[0].length; grown = 1; - } - /* - * Already had space in some data block. - * Just read that one in. - */ - else { - error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block), - -1, &dbp, XFS_DATA_FORK, NULL); + } else { + /* + * Already had space in some data block. + * Just read that one in. + */ + error = xfs_dir2_data_read(tp, dp, + xfs_dir2_db_to_da(mp, use_block), + -1, &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -508,7 +508,6 @@ xfs_dir2_leaf_addname( hdr = dbp->b_addr; grown = 0; } - xfs_dir2_data_check(dp, dbp); /* * Point to the biggest freespace in our data block. */ @@ -891,10 +890,9 @@ xfs_dir2_leaf_readbuf( * Read the directory block starting at the first mapping. */ mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff); - error = xfs_da_read_buf(NULL, dp, map->br_startoff, + error = xfs_dir2_data_read(NULL, dp, map->br_startoff, map->br_blockcount >= mp->m_dirblkfsbs ? - XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, - &bp, XFS_DATA_FORK, NULL); + XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp); /* * Should just skip over the data block instead of giving up. @@ -1408,14 +1406,13 @@ xfs_dir2_leaf_lookup_int( if (newdb != curdb) { if (dbp) xfs_trans_brelse(tp, dbp); - error = xfs_da_read_buf(tp, dp, - xfs_dir2_db_to_da(mp, newdb), - -1, &dbp, XFS_DATA_FORK, NULL); + error = xfs_dir2_data_read(tp, dp, + xfs_dir2_db_to_da(mp, newdb), + -1, &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; } - xfs_dir2_data_check(dp, dbp); curdb = newdb; } /* @@ -1450,9 +1447,9 @@ xfs_dir2_leaf_lookup_int( ASSERT(cidb != -1); if (cidb != curdb) { xfs_trans_brelse(tp, dbp); - error = xfs_da_read_buf(tp, dp, - xfs_dir2_db_to_da(mp, cidb), - -1, &dbp, XFS_DATA_FORK, NULL); + error = xfs_dir2_data_read(tp, dp, + xfs_dir2_db_to_da(mp, cidb), + -1, &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1737,8 +1734,7 @@ xfs_dir2_leaf_trim_data( /* * Read the offending data block. We need its buffer. 
*/ - error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp, - XFS_DATA_FORK, NULL); + error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp); if (error) return error; diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index d7f899dfbff5..67b811c17eaa 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -583,9 +583,9 @@ xfs_dir2_leafn_lookup_for_entry( ASSERT(state->extravalid); curbp = state->extrablk.bp; } else { - error = xfs_da_read_buf(tp, dp, + error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, newdb), - -1, &curbp, XFS_DATA_FORK, NULL); + -1, &curbp); if (error) return error; } @@ -1692,8 +1692,8 @@ xfs_dir2_node_addname_int( /* * Read the data block in. */ - error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, dbno), - -1, &dbp, XFS_DATA_FORK, NULL); + error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno), + -1, &dbp); if (error) return error; hdr = dbp->b_addr; diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 263a63287910..71ec82839107 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -46,6 +46,8 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, #define xfs_dir2_data_check(dp,bp) #endif extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); +extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); extern struct xfs_dir2_data_free * xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, -- cgit v1.2.1 From e6f7667c4eef42b6f5bc6cdeb31d0bab62fe5f79 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:15 +1100 Subject: xfs: factor dir2 leaf read Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_dir2_leaf.c | 73 ++++++++++++++++++++++++++++++++++++++++++-------- fs/xfs/xfs_dir2_node.c | 6 ++--- fs/xfs/xfs_dir2_priv.h | 2 ++ 3 files changed, 67 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 0fdf765c917f..97408e3287ed 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -48,6 +48,62 @@ static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp, int first, int last); static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp); +static void +xfs_dir2_leaf_verify( + struct xfs_buf *bp, + __be16 magic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_leaf_hdr *hdr = bp->b_addr; + int block_ok = 0; + + block_ok = hdr->info.magic == magic; + if (!block_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + +static void +xfs_dir2_leaf1_verify( + struct xfs_buf *bp) +{ + xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); +} + +static void +xfs_dir2_leafn_verify( + struct xfs_buf *bp) +{ + xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); +} + +static int +xfs_dir2_leaf_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, xfs_dir2_leaf1_verify); +} + +int +xfs_dir2_leafn_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, xfs_dir2_leafn_verify); +} /* * Convert a 
block form directory to a leaf form directory. @@ -311,14 +367,11 @@ xfs_dir2_leaf_addname( dp = args->dp; tp = args->trans; mp = dp->i_mount; - /* - * Read the leaf block. - */ - error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, - XFS_DATA_FORK, NULL); + + error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp); if (error) return error; - ASSERT(lbp != NULL); + /* * Look up the entry by hash value and name. * We know it's not there, our caller has already done a lookup. @@ -1369,13 +1422,11 @@ xfs_dir2_leaf_lookup_int( dp = args->dp; tp = args->trans; mp = dp->i_mount; - /* - * Read the leaf block into the buffer. - */ - error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, - XFS_DATA_FORK, NULL); + + error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp); if (error) return error; + *lbpp = lbp; leaf = lbp->b_addr; xfs_dir2_leaf_check(dp, lbp); diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 67b811c17eaa..7c6f95697e28 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -1232,11 +1232,11 @@ xfs_dir2_leafn_toosmall( /* * Read the sibling leaf block. */ - error = xfs_da_read_buf(state->args->trans, state->args->dp, - blkno, -1, &bp, XFS_DATA_FORK, NULL); + error = xfs_dir2_leafn_read(state->args->trans, state->args->dp, + blkno, -1, &bp); if (error) return error; - ASSERT(bp != NULL); + /* * Count bytes in the two blocks combined. */ diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 71ec82839107..4560825d099c 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -70,6 +70,8 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp, xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); /* xfs_dir2_leaf.c */ +extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, struct xfs_buf *dbp); extern int xfs_dir2_leaf_addname(struct xfs_da_args *args); -- cgit v1.2.1 From ad14c33ac862601c4c22755ed3b59f1906b134e5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:16 +1100 Subject: xfs: factor and verify attr leaf reads Some reads are not converted yet because it isn't obvious ahead of time what the format of the block is going to be. Need to determine how to tell if the first block in the tree is a node or leaf format block. That will be done in later patches. Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_attr.c | 70 ++++++++++++-------------------------------- fs/xfs/xfs_attr_leaf.c | 78 +++++++++++++++++++++++++++++--------------------- fs/xfs/xfs_attr_leaf.h | 3 ++ 3 files changed, 66 insertions(+), 85 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index cd5a9cd0ded0..d644915367e3 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -903,11 +903,9 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) */ dp = args->dp; args->blkno = 0; - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK, NULL); + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) - return(error); - ASSERT(bp != NULL); + return error; /* * Look up the given attribute in the leaf block. Figure out if @@ -1031,12 +1029,12 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Read in the block containing the "old" attr, then * remove the "old" attr from that block (neat, huh!) 
*/ - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, - &bp, XFS_ATTR_FORK, NULL); + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, + -1, &bp); if (error) - return(error); - ASSERT(bp != NULL); - (void)xfs_attr_leaf_remove(bp, args); + return error; + + xfs_attr_leaf_remove(bp, args); /* * If the result is small enough, shrink it all into the inode. @@ -1100,20 +1098,17 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) */ dp = args->dp; args->blkno = 0; - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK, NULL); - if (error) { - return(error); - } + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + if (error) + return error; - ASSERT(bp != NULL); error = xfs_attr_leaf_lookup_int(bp, args); if (error == ENOATTR) { xfs_trans_brelse(args->trans, bp); return(error); } - (void)xfs_attr_leaf_remove(bp, args); + xfs_attr_leaf_remove(bp, args); /* * If the result is small enough, shrink it all into the inode. @@ -1158,11 +1153,9 @@ xfs_attr_leaf_get(xfs_da_args_t *args) trace_xfs_attr_leaf_get(args); args->blkno = 0; - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK, NULL); + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) - return(error); - ASSERT(bp != NULL); + return error; error = xfs_attr_leaf_lookup_int(bp, args); if (error != EEXIST) { @@ -1183,25 +1176,15 @@ xfs_attr_leaf_get(xfs_da_args_t *args) STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context) { - xfs_attr_leafblock_t *leaf; int error; struct xfs_buf *bp; trace_xfs_attr_leaf_list(context); context->cursor->blkno = 0; - error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK, - NULL); + error = xfs_attr_leaf_read(NULL, context->dp, 0, -1, &bp); if (error) return XFS_ERROR(error); - ASSERT(bp != NULL); - leaf = bp->b_addr; - if (unlikely(leaf->hdr.info.magic != cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) { - XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW, - context->dp->i_mount, leaf); - xfs_trans_brelse(NULL, bp); - return XFS_ERROR(EFSCORRUPTED); - } error = xfs_attr_leaf_list_int(bp, context); xfs_trans_brelse(NULL, bp); @@ -1605,12 +1588,9 @@ xfs_attr_node_removename(xfs_da_args_t *args) ASSERT(state->path.blk[0].bp); state->path.blk[0].bp = NULL; - error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp, - XFS_ATTR_FORK, NULL); + error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp); if (error) goto out; - ASSERT((((xfs_attr_leafblock_t *)bp->b_addr)->hdr.info.magic) == - cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { xfs_bmap_init(args->flist, args->firstblock); @@ -1920,14 +1900,6 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) */ for (;;) { leaf = bp->b_addr; - if (unlikely(leaf->hdr.info.magic != - cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) { - XFS_CORRUPTION_ERROR("xfs_attr_node_list(4)", - XFS_ERRLEVEL_LOW, - context->dp->i_mount, leaf); - xfs_trans_brelse(NULL, bp); - return(XFS_ERROR(EFSCORRUPTED)); - } error = xfs_attr_leaf_list_int(bp, context); if (error) { xfs_trans_brelse(NULL, bp); @@ -1937,16 +1909,10 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) break; cursor->blkno = be32_to_cpu(leaf->hdr.info.forw); xfs_trans_brelse(NULL, bp); - error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, - &bp, XFS_ATTR_FORK, NULL); + error = xfs_attr_leaf_read(NULL, context->dp, cursor->blkno, -1, + &bp); if (error) - return(error); - if (unlikely((bp == NULL))) { - 
XFS_ERROR_REPORT("xfs_attr_node_list(5)", - XFS_ERRLEVEL_LOW, - context->dp->i_mount); - return(XFS_ERROR(EFSCORRUPTED)); - } + return error; } xfs_trans_brelse(NULL, bp); return(0); diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index ba2b9a2cd236..357971536d50 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -88,6 +88,36 @@ STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf, xfs_mount_t *mp); STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); +static void +xfs_attr_leaf_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_attr_leaf_hdr *hdr = bp->b_addr; + int block_ok = 0; + + block_ok = hdr->info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC); + if (!block_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + +int +xfs_attr_leaf_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, + XFS_ATTR_FORK, xfs_attr_leaf_verify); +} + /*======================================================================== * Namespace helper routines *========================================================================*/ @@ -870,11 +900,10 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args) error = xfs_da_grow_inode(args, &blkno); if (error) goto out; - error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1, - XFS_ATTR_FORK, NULL); + error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp1); if (error) goto out; - ASSERT(bp1 != NULL); + bp2 = NULL; error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2, XFS_ATTR_FORK); @@ -1641,18 +1670,16 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) blkno = be32_to_cpu(info->back); if (blkno == 0) continue; - error = xfs_da_read_buf(state->args->trans, state->args->dp, - blkno, -1, &bp, XFS_ATTR_FORK, NULL); + error = xfs_attr_leaf_read(state->args->trans, state->args->dp, + blkno, -1, &bp); if (error) return(error); - ASSERT(bp != NULL); leaf = (xfs_attr_leafblock_t *)info; count = be16_to_cpu(leaf->hdr.count); bytes = state->blocksize - (state->blocksize>>2); bytes -= be16_to_cpu(leaf->hdr.usedbytes); leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); count += be16_to_cpu(leaf->hdr.count); bytes -= be16_to_cpu(leaf->hdr.usedbytes); bytes -= count * sizeof(xfs_attr_leaf_entry_t); @@ -2518,15 +2545,11 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) /* * Set up the operation. */ - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK, NULL); - if (error) { + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + if (error) return(error); - } - ASSERT(bp != NULL); leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); ASSERT(args->index >= 0); entry = &leaf->entries[ args->index ]; @@ -2583,15 +2606,11 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args) /* * Set up the operation. 
*/ - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK, NULL); - if (error) { + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + if (error) return(error); - } - ASSERT(bp != NULL); leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); ASSERT(args->index >= 0); entry = &leaf->entries[ args->index ]; @@ -2640,35 +2659,28 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) /* * Read the block containing the "old" attr */ - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp1, - XFS_ATTR_FORK, NULL); - if (error) { - return(error); - } - ASSERT(bp1 != NULL); + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1); + if (error) + return error; /* * Read the block containing the "new" attr, if it is different */ if (args->blkno2 != args->blkno) { - error = xfs_da_read_buf(args->trans, args->dp, args->blkno2, - -1, &bp2, XFS_ATTR_FORK, NULL); - if (error) { - return(error); - } - ASSERT(bp2 != NULL); + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno2, + -1, &bp2); + if (error) + return error; } else { bp2 = bp1; } leaf1 = bp1->b_addr; - ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(args->index < be16_to_cpu(leaf1->hdr.count)); ASSERT(args->index >= 0); entry1 = &leaf1->entries[ args->index ]; leaf2 = bp2->b_addr; - ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count)); ASSERT(args->index2 >= 0); entry2 = &leaf2->entries[ args->index2 ]; diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index dea17722945e..8f7ab986f45d 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -227,6 +227,9 @@ int xfs_attr_leaf_to_shortform(struct xfs_buf *bp, int xfs_attr_leaf_clearflag(struct xfs_da_args *args); int xfs_attr_leaf_setflag(struct xfs_da_args *args); int xfs_attr_leaf_flipflags(xfs_da_args_t *args); +int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mappedbno, + struct xfs_buf **bpp); /* * Routines used for growing the Btree. 
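The conversion repeated across fs/xfs/xfs_attr.c in this patch follows a single pattern, sketched here for illustration. The fragment below is not part of the patch itself; it assumes the xfs_da_args context visible in the hunks above, and shows how the open-coded read-and-assert sequence collapses into the factored helper, whose verifier flags EFSCORRUPTED at I/O completion before the caller ever sees the buffer:

	/* before: read the block, then check it by hand */
	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1,
				&bp, XFS_ATTR_FORK, NULL);
	if (error)
		return error;
	ASSERT(bp != NULL);

	/* after: xfs_attr_leaf_read() runs xfs_attr_leaf_verify() for us */
	error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1,
				   &bp);
	if (error)
		return error;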
-- cgit v1.2.1 From d9392a4bb75503fc2adbb5237c3df940c6467eb2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:17 +1100 Subject: xfs: add xfs_da_node verification Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_attr.c | 22 ++++------ fs/xfs/xfs_attr_leaf.c | 12 +++--- fs/xfs/xfs_attr_leaf.h | 8 ++-- fs/xfs/xfs_da_btree.c | 109 +++++++++++++++++++++++++++++++++++++------------ fs/xfs/xfs_da_btree.h | 3 ++ fs/xfs/xfs_dir2_leaf.c | 2 +- fs/xfs/xfs_dir2_priv.h | 1 + 7 files changed, 107 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index d644915367e3..aaf472532b3c 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -1696,10 +1696,10 @@ xfs_attr_refillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->disk_blkno) { - error = xfs_da_read_buf(state->args->trans, + error = xfs_da_node_read(state->args->trans, state->args->dp, blk->blkno, blk->disk_blkno, - &blk->bp, XFS_ATTR_FORK, NULL); + &blk->bp, XFS_ATTR_FORK); if (error) return(error); } else { @@ -1715,10 +1715,10 @@ xfs_attr_refillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->disk_blkno) { - error = xfs_da_read_buf(state->args->trans, + error = xfs_da_node_read(state->args->trans, state->args->dp, blk->blkno, blk->disk_blkno, - &blk->bp, XFS_ATTR_FORK, NULL); + &blk->bp, XFS_ATTR_FORK); if (error) return(error); } else { @@ -1807,8 +1807,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) */ bp = NULL; if (cursor->blkno > 0) { - error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, - &bp, XFS_ATTR_FORK, NULL); + error = xfs_da_node_read(NULL, context->dp, cursor->blkno, -1, + &bp, XFS_ATTR_FORK); if ((error != 0) && (error != EFSCORRUPTED)) return(error); if (bp) { @@ -1849,17 +1849,11 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) if (bp == NULL) { cursor->blkno = 0; for (;;) { - error = xfs_da_read_buf(NULL, context->dp, + error = xfs_da_node_read(NULL, context->dp, cursor->blkno, -1, &bp, - XFS_ATTR_FORK, NULL); + XFS_ATTR_FORK); if (error) return(error); - if (unlikely(bp == NULL)) { - XFS_ERROR_REPORT("xfs_attr_node_list(2)", - XFS_ERRLEVEL_LOW, - context->dp->i_mount); - return(XFS_ERROR(EFSCORRUPTED)); - } node = bp->b_addr; if (node->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 357971536d50..efe170da2881 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -88,7 +88,7 @@ STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf, xfs_mount_t *mp); STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); -static void +void xfs_attr_leaf_verify( struct xfs_buf *bp) { @@ -2765,7 +2765,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) * the extents in reverse order the extent containing * block 0 must still be there. */ - error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK, NULL); + error = xfs_da_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); if (error) return(error); blkno = XFS_BUF_ADDR(bp); @@ -2850,8 +2850,8 @@ xfs_attr_node_inactive( * traversal of the tree so we may deal with many blocks * before we come back to this one. 
*/ - error = xfs_da_read_buf(*trans, dp, child_fsb, -2, &child_bp, - XFS_ATTR_FORK, NULL); + error = xfs_da_node_read(*trans, dp, child_fsb, -2, &child_bp, + XFS_ATTR_FORK); if (error) return(error); if (child_bp) { @@ -2891,8 +2891,8 @@ xfs_attr_node_inactive( * child block number. */ if ((i+1) < count) { - error = xfs_da_read_buf(*trans, dp, 0, parent_blkno, - &bp, XFS_ATTR_FORK, NULL); + error = xfs_da_node_read(*trans, dp, 0, parent_blkno, + &bp, XFS_ATTR_FORK); if (error) return(error); child_fsb = be32_to_cpu(node->btree[i+1].before); diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index 8f7ab986f45d..098e9a58ad9f 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -227,9 +227,6 @@ int xfs_attr_leaf_to_shortform(struct xfs_buf *bp, int xfs_attr_leaf_clearflag(struct xfs_da_args *args); int xfs_attr_leaf_setflag(struct xfs_da_args *args); int xfs_attr_leaf_flipflags(xfs_da_args_t *args); -int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t bno, xfs_daddr_t mappedbno, - struct xfs_buf **bpp); /* * Routines used for growing the Btree. @@ -264,4 +261,9 @@ int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp, struct xfs_buf *leaf2_bp); int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local); +int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mappedbno, + struct xfs_buf **bpp); +void xfs_attr_leaf_verify(struct xfs_buf *bp); + #endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index f9e9149de009..1b84fc50a053 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -91,6 +91,68 @@ STATIC int xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *save_blk); STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state); +static void +__xfs_da_node_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_da_node_hdr *hdr = bp->b_addr; + int block_ok = 0; + + block_ok = hdr->info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC); + block_ok = block_ok && + be16_to_cpu(hdr->level) > 0 && + be16_to_cpu(hdr->count) > 0 ; + if (!block_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + +static void +xfs_da_node_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_da_blkinfo *info = bp->b_addr; + + switch (be16_to_cpu(info->magic)) { + case XFS_DA_NODE_MAGIC: + __xfs_da_node_verify(bp); + return; + case XFS_ATTR_LEAF_MAGIC: + xfs_attr_leaf_verify(bp); + return; + case XFS_DIR2_LEAFN_MAGIC: + xfs_dir2_leafn_verify(bp); + return; + default: + break; + } + + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, info); + xfs_buf_ioerror(bp, EFSCORRUPTED); + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + +int +xfs_da_node_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp, + int which_fork) +{ + return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, + which_fork, xfs_da_node_verify); +} + /*======================================================================== * Routines used for growing the Btree. 
*========================================================================*/ @@ -746,8 +808,8 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) */ child = be32_to_cpu(oldroot->btree[0].before); ASSERT(child != 0); - error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp, - args->whichfork, NULL); + error = xfs_da_node_read(args->trans, args->dp, child, -1, &bp, + args->whichfork); if (error) return(error); ASSERT(bp != NULL); @@ -837,9 +899,8 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) blkno = be32_to_cpu(info->back); if (blkno == 0) continue; - error = xfs_da_read_buf(state->args->trans, state->args->dp, - blkno, -1, &bp, state->args->whichfork, - NULL); + error = xfs_da_node_read(state->args->trans, state->args->dp, + blkno, -1, &bp, state->args->whichfork); if (error) return(error); ASSERT(bp != NULL); @@ -1084,8 +1145,8 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) * Read the next node down in the tree. */ blk->blkno = blkno; - error = xfs_da_read_buf(args->trans, args->dp, blkno, - -1, &blk->bp, args->whichfork, NULL); + error = xfs_da_node_read(args->trans, args->dp, blkno, + -1, &blk->bp, args->whichfork); if (error) { blk->blkno = 0; state->path.active--; @@ -1246,9 +1307,9 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, new_info->forw = cpu_to_be32(old_blk->blkno); new_info->back = old_info->back; if (old_info->back) { - error = xfs_da_read_buf(args->trans, args->dp, + error = xfs_da_node_read(args->trans, args->dp, be32_to_cpu(old_info->back), - -1, &bp, args->whichfork, NULL); + -1, &bp, args->whichfork); if (error) return(error); ASSERT(bp != NULL); @@ -1267,9 +1328,9 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, new_info->forw = old_info->forw; new_info->back = cpu_to_be32(old_blk->blkno); if (old_info->forw) { - error = xfs_da_read_buf(args->trans, args->dp, + error = xfs_da_node_read(args->trans, args->dp, be32_to_cpu(old_info->forw), - -1, &bp, args->whichfork, NULL); + -1, &bp, args->whichfork); if (error) return(error); ASSERT(bp != NULL); @@ -1367,9 +1428,9 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, trace_xfs_da_unlink_back(args); save_info->back = drop_info->back; if (drop_info->back) { - error = xfs_da_read_buf(args->trans, args->dp, + error = xfs_da_node_read(args->trans, args->dp, be32_to_cpu(drop_info->back), - -1, &bp, args->whichfork, NULL); + -1, &bp, args->whichfork); if (error) return(error); ASSERT(bp != NULL); @@ -1384,9 +1445,9 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, trace_xfs_da_unlink_forward(args); save_info->forw = drop_info->forw; if (drop_info->forw) { - error = xfs_da_read_buf(args->trans, args->dp, + error = xfs_da_node_read(args->trans, args->dp, be32_to_cpu(drop_info->forw), - -1, &bp, args->whichfork, NULL); + -1, &bp, args->whichfork); if (error) return(error); ASSERT(bp != NULL); @@ -1470,8 +1531,8 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, * Read the next child block. */ blk->blkno = blkno; - error = xfs_da_read_buf(args->trans, args->dp, blkno, -1, - &blk->bp, args->whichfork, NULL); + error = xfs_da_node_read(args->trans, args->dp, blkno, -1, + &blk->bp, args->whichfork); if (error) return(error); ASSERT(blk->bp != NULL); @@ -1734,7 +1795,7 @@ xfs_da_swap_lastblock( * Read the last block in the btree space. 
*/ last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs; - error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w, NULL); + error = xfs_da_node_read(tp, ip, last_blkno, -1, &last_buf, w); if (error) return error; /* @@ -1761,8 +1822,7 @@ xfs_da_swap_lastblock( * If the moved block has a left sibling, fix up the pointers. */ if ((sib_blkno = be32_to_cpu(dead_info->back))) { - error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w, - NULL); + error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w); if (error) goto done; sib_info = sib_buf->b_addr; @@ -1784,8 +1844,7 @@ xfs_da_swap_lastblock( * If the moved block has a right sibling, fix up the pointers. */ if ((sib_blkno = be32_to_cpu(dead_info->forw))) { - error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w, - NULL); + error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w); if (error) goto done; sib_info = sib_buf->b_addr; @@ -1809,8 +1868,7 @@ xfs_da_swap_lastblock( * Walk down the tree looking for the parent of the moved block. */ for (;;) { - error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w, - NULL); + error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w); if (error) goto done; par_node = par_buf->b_addr; @@ -1861,8 +1919,7 @@ xfs_da_swap_lastblock( error = XFS_ERROR(EFSCORRUPTED); goto done; } - error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w, - NULL); + error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w); if (error) goto done; par_node = par_buf->b_addr; diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index bf8bfaa0d356..2d1bec4b7595 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -213,6 +213,9 @@ int xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, */ int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, xfs_da_state_blk_t *new_blk); +int xfs_da_node_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mappedbno, + struct xfs_buf **bpp, int which_fork); /* * Utility routines. diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 97408e3287ed..67cc21c2a45d 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -74,7 +74,7 @@ xfs_dir2_leaf1_verify( xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); } -static void +void xfs_dir2_leafn_verify( struct xfs_buf *bp) { diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 4560825d099c..e0b96e7693ea 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -70,6 +70,7 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp, xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); /* xfs_dir2_leaf.c */ +extern void xfs_dir2_leafn_verify(struct xfs_buf *bp); extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, -- cgit v1.2.1 From da6958c873ecd846d71fafbfe0f6168bb9c2c99e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:18 +1100 Subject: xfs: Add verifiers to dir2 data readahead. 
Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_da_btree.c | 4 ++-- fs/xfs/xfs_da_btree.h | 4 ++-- fs/xfs/xfs_dir2_data.c | 13 ++++++++++++- fs/xfs/xfs_dir2_leaf.c | 11 +++++------ fs/xfs/xfs_dir2_priv.h | 2 ++ fs/xfs/xfs_file.c | 4 +++- 6 files changed, 26 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 1b84fc50a053..93ebc0fc6dd9 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -2285,10 +2285,10 @@ xfs_da_reada_buf( struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, + xfs_daddr_t mappedbno, int whichfork, xfs_buf_iodone_t verifier) { - xfs_daddr_t mappedbno = -1; struct xfs_buf_map map; struct xfs_buf_map *mapp; int nmap; @@ -2296,7 +2296,7 @@ xfs_da_reada_buf( mapp = ↦ nmap = 1; - error = xfs_dabuf_map(trans, dp, bno, -1, whichfork, + error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork, &mapp, &nmap); if (error) { /* mapping a hole is not an error, but we don't continue */ diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 2d1bec4b7595..521b008445ab 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -231,8 +231,8 @@ int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, struct xfs_buf **bpp, int whichfork, xfs_buf_iodone_t verifier); xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp, - xfs_dablk_t bno, int whichfork, - xfs_buf_iodone_t verifier); + xfs_dablk_t bno, xfs_daddr_t mapped_bno, + int whichfork, xfs_buf_iodone_t verifier); int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, struct xfs_buf *dead_buf); diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index 0ef04f1bf511..1a43c8593c00 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -185,7 +185,7 @@ __xfs_dir2_data_check( return 0; } -static void +void xfs_dir2_data_verify( struct xfs_buf *bp) { @@ -217,6 +217,17 @@ xfs_dir2_data_read( XFS_DATA_FORK, xfs_dir2_data_verify); } +int +xfs_dir2_data_readahead( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mapped_bno) +{ + return xfs_da_reada_buf(tp, dp, bno, mapped_bno, + XFS_DATA_FORK, xfs_dir2_data_verify); +} + /* * Given a data block and an unused entry from that block, * return the bestfree entry if any that corresponds to it. diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 67cc21c2a45d..8a95547d42ac 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -972,11 +972,11 @@ xfs_dir2_leaf_readbuf( */ if (i > mip->ra_current && map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) { - xfs_buf_readahead(mp->m_ddev_targp, + xfs_dir2_data_readahead(NULL, dp, + map[mip->ra_index].br_startoff + mip->ra_offset, XFS_FSB_TO_DADDR(mp, map[mip->ra_index].br_startblock + - mip->ra_offset), - (int)BTOBB(mp->m_dirblksize), NULL); + mip->ra_offset)); mip->ra_current = i; } @@ -985,10 +985,9 @@ xfs_dir2_leaf_readbuf( * use our mapping, but this is a very rare case. 
*/ else if (i > mip->ra_current) { - xfs_da_reada_buf(NULL, dp, + xfs_dir2_data_readahead(NULL, dp, map[mip->ra_index].br_startoff + - mip->ra_offset, - XFS_DATA_FORK, NULL); + mip->ra_offset, -1); mip->ra_current = i; } diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index e0b96e7693ea..daf5d0fc6165 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -48,6 +48,8 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); +extern int xfs_dir2_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mapped_bno); extern struct xfs_dir2_data_free * xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f6dab7da7bcc..400b187595bb 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -31,6 +31,8 @@ #include "xfs_error.h" #include "xfs_vnodeops.h" #include "xfs_da_btree.h" +#include "xfs_dir2_format.h" +#include "xfs_dir2_priv.h" #include "xfs_ioctl.h" #include "xfs_trace.h" @@ -891,7 +893,7 @@ xfs_dir_open( */ mode = xfs_ilock_map_shared(ip); if (ip->i_d.di_nextents > 0) - xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK, NULL); + xfs_dir2_data_readahead(NULL, ip, 0, -1); xfs_iunlock(ip, mode); return 0; } -- cgit v1.2.1 From cfb02852226aa449fe27075caffe88726507668c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:19 +1100 Subject: xfs: add buffer pre-write callback Add a callback to the buffer write path to enable verification of the buffer and CRC calculation prior to issuing the write to the underlying storage. If the callback function detects some kind of failure or error condition, it must mark the buffer with an error so that the caller can take appropriate action. In the case of xfs_buf_ioapply(), a corrupt metadata buffer will trigger a shutdown of the filesystem, because something is clearly wrong and we can't allow corrupt metadata to be written to disk. Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 16 ++++++++++++++++ fs/xfs/xfs_buf.h | 3 +++ 2 files changed, 19 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index fbc965fc075a..bd1a948ee39c 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -569,7 +569,9 @@ found: */ if (bp->b_flags & XBF_STALE) { ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); + ASSERT(bp->b_iodone == NULL); bp->b_flags &= _XBF_KMEM | _XBF_PAGES; + bp->b_pre_io = NULL; } trace_xfs_buf_find(bp, flags, _RET_IP_); @@ -1323,6 +1325,20 @@ _xfs_buf_ioapply( /* we only use the buffer cache for meta-data */ rw |= REQ_META; + /* + * run the pre-io callback function if it exists. If this function + * fails it will mark the buffer with an error and the IO should + * not be dispatched. + */ + if (bp->b_pre_io) { + bp->b_pre_io(bp); + if (bp->b_error) { + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_CORRUPT_INCORE); + return; + } + } + /* + * Walk all the vectors issuing IO on them.
Set up the initial offset * into the buffer and the desired IO size before we start - diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 677b1dc822f4..51bc16a1cd9c 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -155,6 +155,9 @@ typedef struct xfs_buf { unsigned int b_offset; /* page offset in first page */ unsigned short b_error; /* error code on I/O */ + void (*b_pre_io)(struct xfs_buf *); + /* pre-io callback function */ + #ifdef XFS_BUF_LOCK_TRACKING int b_last_holder; #endif -- cgit v1.2.1 From 612cfbfe174a89d565363fff7f3961a2dda5fb71 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 14 Nov 2012 17:52:32 +1100 Subject: xfs: add pre-write metadata buffer verifier callbacks These verifiers are essentially the same code as the read verifiers, but do not require ioend processing. Hence factor the read verifier functions and add a new write verifier wrapper that is used as the callback. This is done as one large patch for all verifiers rather than one patch per verifier as the change is largely mechanical. This includes hooking up the write verifier via the read verifier function. Hooking up the write verifier for buffers obtained via xfs_trans_get_buf() will be done in a separate patch as that touches code in many different places rather than just the verifier functions. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 35 ++++++++++++++++++++++++++++++++--- fs/xfs/xfs_alloc_btree.c | 21 +++++++++++++++++---- fs/xfs/xfs_attr_leaf.c | 19 +++++++++++++++++-- fs/xfs/xfs_attr_leaf.h | 2 +- fs/xfs/xfs_bmap_btree.c | 21 +++++++++++++++++---- fs/xfs/xfs_da_btree.c | 37 +++++++++++++++++++++++++------------ fs/xfs/xfs_dir2_block.c | 16 +++++++++++++++- fs/xfs/xfs_dir2_data.c | 19 +++++++++++++++++-- fs/xfs/xfs_dir2_leaf.c | 31 ++++++++++++++++++++++++------- fs/xfs/xfs_dir2_node.c | 17 ++++++++++++++++- fs/xfs/xfs_dir2_priv.h | 2 +- fs/xfs/xfs_dquot.c | 27 +++++++++++++++++++++------ fs/xfs/xfs_dquot.h | 2 +- fs/xfs/xfs_ialloc.c | 17 ++++++++++++++++- fs/xfs/xfs_ialloc_btree.c | 19 ++++++++++++++++--- fs/xfs/xfs_inode.c | 19 +++++++++++++++++-- fs/xfs/xfs_inode.h | 2 +- fs/xfs/xfs_itable.c | 2 +- fs/xfs/xfs_mount.c | 19 +++++++++++++++++-- fs/xfs/xfs_qm.c | 2 +- 20 files changed, 273 insertions(+), 56 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 38b4ab8957ff..d12bbedf6fe5 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -430,8 +430,8 @@ xfs_alloc_fixup_trees( return 0; } -void -xfs_agfl_read_verify( +static void +xfs_agfl_verify( struct xfs_buf *bp) { #ifdef WHEN_CRCS_COME_ALONG @@ -463,6 +463,21 @@ xfs_agfl_read_verify( xfs_buf_ioerror(bp, EFSCORRUPTED); } #endif +} + +static void +xfs_agfl_write_verify( + struct xfs_buf *bp) +{ + xfs_agfl_verify(bp); +} + +void +xfs_agfl_read_verify( + struct xfs_buf *bp) +{ + xfs_agfl_verify(bp); + bp->b_pre_io = xfs_agfl_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } @@ -2129,7 +2144,7 @@ xfs_alloc_put_freelist( } static void -xfs_agf_read_verify( +xfs_agf_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -2164,7 +2179,21 @@ xfs_agf_read_verify( XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agf); xfs_buf_ioerror(bp, EFSCORRUPTED); } +} + +static void +xfs_agf_write_verify( + struct xfs_buf *bp) +{ + xfs_agf_verify(bp); +} +void +xfs_agf_read_verify( + struct xfs_buf *bp) +{ + xfs_agf_verify(bp); + bp->b_pre_io = xfs_agf_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } diff 
--git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index 46961e52e9b8..6e98b22ebde0 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -272,8 +272,8 @@ xfs_allocbt_key_diff( return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; } -void -xfs_allocbt_read_verify( +static void +xfs_allocbt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -323,11 +323,24 @@ xfs_allocbt_read_verify( if (!sblock_ok) { trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR("xfs_allocbt_read_verify", - XFS_ERRLEVEL_LOW, mp, block); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); xfs_buf_ioerror(bp, EFSCORRUPTED); } +} +static void +xfs_allocbt_write_verify( + struct xfs_buf *bp) +{ + xfs_allocbt_verify(bp); +} + +void +xfs_allocbt_read_verify( + struct xfs_buf *bp) +{ + xfs_allocbt_verify(bp); + bp->b_pre_io = xfs_allocbt_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index efe170da2881..57729d71ab1a 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -88,7 +88,7 @@ STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf, xfs_mount_t *mp); STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); -void +static void xfs_attr_leaf_verify( struct xfs_buf *bp) { @@ -101,11 +101,26 @@ xfs_attr_leaf_verify( XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); xfs_buf_ioerror(bp, EFSCORRUPTED); } +} + +static void +xfs_attr_leaf_write_verify( + struct xfs_buf *bp) +{ + xfs_attr_leaf_verify(bp); +} +void +xfs_attr_leaf_read_verify( + struct xfs_buf *bp) +{ + xfs_attr_leaf_verify(bp); + bp->b_pre_io = xfs_attr_leaf_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } + int xfs_attr_leaf_read( struct xfs_trans *tp, @@ -115,7 +130,7 @@ xfs_attr_leaf_read( struct xfs_buf **bpp) { return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, - XFS_ATTR_FORK, xfs_attr_leaf_verify); + XFS_ATTR_FORK, xfs_attr_leaf_read_verify); } /*======================================================================== diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index 098e9a58ad9f..3bbf6277e43c 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -264,6 +264,6 @@ int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); -void xfs_attr_leaf_verify(struct xfs_buf *bp); +void xfs_attr_leaf_read_verify(struct xfs_buf *bp); #endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index bddca9b92869..17d7423e7503 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -708,8 +708,8 @@ xfs_bmbt_key_diff( cur->bc_rec.b.br_startoff; } -void -xfs_bmbt_read_verify( +static void +xfs_bmbt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -744,11 +744,24 @@ xfs_bmbt_read_verify( if (!lblock_ok) { trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR("xfs_bmbt_read_verify", - XFS_ERRLEVEL_LOW, mp, block); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); xfs_buf_ioerror(bp, EFSCORRUPTED); } +} +static void +xfs_bmbt_write_verify( + struct xfs_buf *bp) +{ + xfs_bmbt_verify(bp); +} + +void +xfs_bmbt_read_verify( + struct xfs_buf *bp) +{ + xfs_bmbt_verify(bp); + bp->b_pre_io = xfs_bmbt_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } diff --git 
a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 93ebc0fc6dd9..6bb0a59eaaee 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -92,7 +92,7 @@ STATIC int xfs_da_blk_unlink(xfs_da_state_t *state, STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state); static void -__xfs_da_node_verify( +xfs_da_node_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -108,12 +108,17 @@ __xfs_da_node_verify( xfs_buf_ioerror(bp, EFSCORRUPTED); } - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } static void -xfs_da_node_verify( +xfs_da_node_write_verify( + struct xfs_buf *bp) +{ + xfs_da_node_verify(bp); +} + +static void +xfs_da_node_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -121,21 +126,22 @@ xfs_da_node_verify( switch (be16_to_cpu(info->magic)) { case XFS_DA_NODE_MAGIC: - __xfs_da_node_verify(bp); - return; + xfs_da_node_verify(bp); + break; case XFS_ATTR_LEAF_MAGIC: - xfs_attr_leaf_verify(bp); + xfs_attr_leaf_read_verify(bp); return; case XFS_DIR2_LEAFN_MAGIC: - xfs_dir2_leafn_verify(bp); + xfs_dir2_leafn_read_verify(bp); return; default: + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + mp, info); + xfs_buf_ioerror(bp, EFSCORRUPTED); break; } - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, info); - xfs_buf_ioerror(bp, EFSCORRUPTED); - + bp->b_pre_io = xfs_da_node_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } @@ -150,7 +156,7 @@ xfs_da_node_read( int which_fork) { return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, - which_fork, xfs_da_node_verify); + which_fork, xfs_da_node_read_verify); } /*======================================================================== @@ -816,7 +822,14 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) xfs_da_blkinfo_onlychild_validate(bp->b_addr, be16_to_cpu(oldroot->hdr.level)); + /* + * This could be copying a leaf back into the root block in the case of + * there only being a single leaf block left in the tree. Hence we have + * to update the pre_io pointer as well to match the buffer type change + * that could occur. 
+ */ memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize); + root_blk->bp->b_pre_io = bp->b_pre_io; xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); error = xfs_da_shrink_inode(args, child, bp); return(error); diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index ca03b109772d..0f8793c74fe2 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -71,7 +71,21 @@ xfs_dir2_block_verify( XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); xfs_buf_ioerror(bp, EFSCORRUPTED); } +} +static void +xfs_dir2_block_write_verify( + struct xfs_buf *bp) +{ + xfs_dir2_block_verify(bp); +} + +void +xfs_dir2_block_read_verify( + struct xfs_buf *bp) +{ + xfs_dir2_block_verify(bp); + bp->b_pre_io = xfs_dir2_block_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } @@ -85,7 +99,7 @@ xfs_dir2_block_read( struct xfs_mount *mp = dp->i_mount; return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp, - XFS_DATA_FORK, xfs_dir2_block_verify); + XFS_DATA_FORK, xfs_dir2_block_read_verify); } static void diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index 1a43c8593c00..b555585f5ab6 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -200,11 +200,26 @@ xfs_dir2_data_verify( XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); xfs_buf_ioerror(bp, EFSCORRUPTED); } +} + +static void +xfs_dir2_data_write_verify( + struct xfs_buf *bp) +{ + xfs_dir2_data_verify(bp); +} +void +xfs_dir2_data_read_verify( + struct xfs_buf *bp) +{ + xfs_dir2_data_verify(bp); + bp->b_pre_io = xfs_dir2_data_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } + int xfs_dir2_data_read( struct xfs_trans *tp, @@ -214,7 +229,7 @@ xfs_dir2_data_read( struct xfs_buf **bpp) { return xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, - XFS_DATA_FORK, xfs_dir2_data_verify); + XFS_DATA_FORK, xfs_dir2_data_read_verify); } int @@ -225,7 +240,7 @@ xfs_dir2_data_readahead( xfs_daddr_t mapped_bno) { return xfs_da_reada_buf(tp, dp, bno, mapped_bno, - XFS_DATA_FORK, xfs_dir2_data_verify); + XFS_DATA_FORK, xfs_dir2_data_read_verify); } /* diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 8a95547d42ac..5b3bcab2a656 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -62,23 +62,40 @@ xfs_dir2_leaf_verify( XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); xfs_buf_ioerror(bp, EFSCORRUPTED); } +} + +static void +xfs_dir2_leaf1_write_verify( + struct xfs_buf *bp) +{ + xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); +} +static void +xfs_dir2_leaf1_read_verify( + struct xfs_buf *bp) +{ + xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); + bp->b_pre_io = xfs_dir2_leaf1_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } static void -xfs_dir2_leaf1_verify( - struct xfs_buf *bp) +xfs_dir2_leafn_write_verify( + struct xfs_buf *bp) { - xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); + xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); } void -xfs_dir2_leafn_verify( - struct xfs_buf *bp) +xfs_dir2_leafn_read_verify( + struct xfs_buf *bp) { xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + bp->b_pre_io = xfs_dir2_leafn_write_verify; + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); } static int @@ -90,7 +107,7 @@ xfs_dir2_leaf_read( struct xfs_buf **bpp) { return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, xfs_dir2_leaf1_verify); + XFS_DATA_FORK, xfs_dir2_leaf1_read_verify); } int @@ -102,7 +119,7 @@ xfs_dir2_leafn_read( struct xfs_buf **bpp) { return 
xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, xfs_dir2_leafn_verify); + XFS_DATA_FORK, xfs_dir2_leafn_read_verify); } /* diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 7c6f95697e28..a58abe1fc0d0 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -69,11 +69,26 @@ xfs_dir2_free_verify( XFS_ERRLEVEL_LOW, mp, hdr); xfs_buf_ioerror(bp, EFSCORRUPTED); } +} + +static void +xfs_dir2_free_write_verify( + struct xfs_buf *bp) +{ + xfs_dir2_free_verify(bp); +} +void +xfs_dir2_free_read_verify( + struct xfs_buf *bp) +{ + xfs_dir2_free_verify(bp); + bp->b_pre_io = xfs_dir2_free_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } + static int __xfs_dir2_free_read( struct xfs_trans *tp, @@ -83,7 +98,7 @@ __xfs_dir2_free_read( struct xfs_buf **bpp) { return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, xfs_dir2_free_verify); + XFS_DATA_FORK, xfs_dir2_free_read_verify); } int diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index daf5d0fc6165..7ec61af8449f 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -72,7 +72,7 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp, xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); /* xfs_dir2_leaf.c */ -extern void xfs_dir2_leafn_verify(struct xfs_buf *bp); +extern void xfs_dir2_leafn_read_verify(struct xfs_buf *bp); extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 0ba0f0992d6e..b38a10e6f2e0 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -360,8 +360,8 @@ xfs_qm_dqalloc( return (error); } -void -xfs_dquot_read_verify( +static void +xfs_dquot_buf_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -388,12 +388,26 @@ xfs_dquot_read_verify( error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, "xfs_dquot_read_verify"); if (error) { - XFS_CORRUPTION_ERROR("xfs_dquot_read_verify", - XFS_ERRLEVEL_LOW, mp, d); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d); xfs_buf_ioerror(bp, EFSCORRUPTED); break; } } +} + +static void +xfs_dquot_buf_write_verify( + struct xfs_buf *bp) +{ + xfs_dquot_buf_verify(bp); +} + +void +xfs_dquot_buf_read_verify( + struct xfs_buf *bp) +{ + xfs_dquot_buf_verify(bp); + bp->b_pre_io = xfs_dquot_buf_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } @@ -413,7 +427,7 @@ xfs_qm_dqrepair( /* * Read the buffer without verification so we get the corrupted - * buffer returned to us. + * buffer returned to us. make sure we verify it on write, though. 
*/ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, @@ -423,6 +437,7 @@ xfs_qm_dqrepair( ASSERT(*bpp == NULL); return XFS_ERROR(error); } + (*bpp)->b_pre_io = xfs_dquot_buf_write_verify; ASSERT(xfs_buf_islocked(*bpp)); d = (struct xfs_dqblk *)(*bpp)->b_addr; @@ -521,7 +536,7 @@ xfs_qm_dqtobp( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, - 0, &bp, xfs_dquot_read_verify); + 0, &bp, xfs_dquot_buf_read_verify); if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) { xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff * diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index a08ba92d7da0..5438d883b628 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -140,7 +140,7 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type) extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, uint, struct xfs_dquot **); -extern void xfs_dquot_read_verify(struct xfs_buf *bp); +extern void xfs_dquot_buf_read_verify(struct xfs_buf *bp); extern void xfs_qm_dqdestroy(xfs_dquot_t *); extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **); extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 5bd255e5f7b8..070f41845572 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -1473,7 +1473,7 @@ xfs_check_agi_unlinked( #endif static void -xfs_agi_read_verify( +xfs_agi_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -1502,6 +1502,21 @@ xfs_agi_read_verify( xfs_buf_ioerror(bp, EFSCORRUPTED); } xfs_check_agi_unlinked(agi); +} + +static void +xfs_agi_write_verify( + struct xfs_buf *bp) +{ + xfs_agi_verify(bp); +} + +void +xfs_agi_read_verify( + struct xfs_buf *bp) +{ + xfs_agi_verify(bp); + bp->b_pre_io = xfs_agi_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index 11306c6d61c7..15a79f8ca03c 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -183,7 +183,7 @@ xfs_inobt_key_diff( } void -xfs_inobt_read_verify( +xfs_inobt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -211,11 +211,24 @@ xfs_inobt_read_verify( if (!sblock_ok) { trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR("xfs_inobt_read_verify", - XFS_ERRLEVEL_LOW, mp, block); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); xfs_buf_ioerror(bp, EFSCORRUPTED); } +} + +static void +xfs_inobt_write_verify( + struct xfs_buf *bp) +{ + xfs_inobt_verify(bp); +} +void +xfs_inobt_read_verify( + struct xfs_buf *bp) +{ + xfs_inobt_verify(bp); + bp->b_pre_io = xfs_inobt_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 3a243d076950..910b2da01042 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -382,7 +382,7 @@ xfs_inobp_check( } #endif -void +static void xfs_inode_buf_verify( struct xfs_buf *bp) { @@ -418,6 +418,21 @@ xfs_inode_buf_verify( } } xfs_inobp_check(mp, bp); +} + +static void +xfs_inode_buf_write_verify( + struct xfs_buf *bp) +{ + xfs_inode_buf_verify(bp); +} + +void +xfs_inode_buf_read_verify( + struct xfs_buf *bp) +{ + xfs_inode_buf_verify(bp); + bp->b_pre_io = xfs_inode_buf_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } @@ -447,7 +462,7 @@ xfs_imap_to_bp( buf_flags |= XBF_UNMAPPED; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, (int)imap->im_len, buf_flags, 
&bp, - xfs_inode_buf_verify); + xfs_inode_buf_read_verify); if (error) { if (error == EAGAIN) { ASSERT(buf_flags & XBF_TRYLOCK); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 1a892114792f..a322c19723a3 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -554,7 +554,7 @@ int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, struct xfs_buf **, uint, uint); int xfs_iread(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, uint); -void xfs_inode_buf_verify(struct xfs_buf *); +void xfs_inode_buf_read_verify(struct xfs_buf *); void xfs_dinode_to_disk(struct xfs_dinode *, struct xfs_icdinode *); void xfs_idestroy_fork(struct xfs_inode *, int); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 0f18d412e3e8..7f86fdaab7ae 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -397,7 +397,7 @@ xfs_bulkstat( & ~r.ir_free) xfs_btree_reada_bufs(mp, agno, agbno, nbcluster, - xfs_inode_buf_verify); + xfs_inode_buf_read_verify); } irbp->ir_startino = r.ir_startino; irbp->ir_freecount = r.ir_freecount; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index bff18d73c610..c85da75e4a43 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -612,8 +612,8 @@ xfs_sb_to_disk( } } -void -xfs_sb_read_verify( +static void +xfs_sb_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -629,6 +629,21 @@ xfs_sb_read_verify( error = xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR); if (error) xfs_buf_ioerror(bp, error); +} + +static void +xfs_sb_write_verify( + struct xfs_buf *bp) +{ + xfs_sb_verify(bp); +} + +void +xfs_sb_read_verify( + struct xfs_buf *bp) +{ + xfs_sb_verify(bp); + bp->b_pre_io = xfs_sb_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index a6dfb97490cc..bd40ae9624e5 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -893,7 +893,7 @@ xfs_qm_dqiter_bufs( error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, bno), mp->m_quotainfo->qi_dqchunklen, 0, &bp, - xfs_dquot_read_verify); + xfs_dquot_buf_read_verify); if (error) break; -- cgit v1.2.1 From b0f539de9fcc543a3ffa40bc22bf51aca6ea6183 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 14 Nov 2012 17:53:49 +1100 Subject: xfs: connect up write verifiers to new buffers Metadata buffers that are read from disk have write verifiers already attached to them, but newly allocated buffers do not. Add appropriate write verifiers to all new metadata buffers. 
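The pattern this patch applies at every allocation site is small enough to model outside the kernel: attach a write-verifier hook when the buffer is created, and have the write-out path refuse to dispatch the IO if that hook flags corruption. A minimal, self-contained sketch of the idea follows; all types and names here are illustrative stand-ins, not the kernel API.

#include <stdio.h>
#include <stdint.h>

/* Illustrative stand-ins only; not the kernel's xfs_buf. */
struct buf {
	uint32_t	magic;			/* on-disk magic number */
	int		error;			/* sticky error, like b_error */
	void		(*pre_io)(struct buf *); /* write verifier, like b_pre_io */
};

#define AGF_MAGIC	0x58414746u		/* "XAGF" */

/* Verifier: refuse to write out a block whose magic is wrong. */
static void agf_write_verify(struct buf *bp)
{
	if (bp->magic != AGF_MAGIC)
		bp->error = 117;		/* stand-in for EFSCORRUPTED */
}

/* Write-out path: run the verifier before dispatching the IO. */
static int buf_submit_write(struct buf *bp)
{
	if (bp->pre_io) {
		bp->pre_io(bp);
		if (bp->error)
			return bp->error;	/* corrupt: do not dispatch */
	}
	/* ... a real implementation would issue the block IO here ... */
	return 0;
}

int main(void)
{
	/* As in this patch: attach the verifier at buffer creation time. */
	struct buf bp = { .magic = AGF_MAGIC, .pre_io = agf_write_verify };

	printf("good buffer: %d\n", buf_submit_write(&bp));	/* prints 0 */
	bp.magic = 0;					/* simulate corruption */
	printf("bad buffer: %d\n", buf_submit_write(&bp));	/* prints 117 */
	return 0;
}

Attaching the verifier at creation rather than at each write means every path that later dirties the buffer inherits the check without further plumbing, which is why the patch touches each allocation site exactly once.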
Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 8 ++-- fs/xfs/xfs_alloc.h | 3 ++ fs/xfs/xfs_alloc_btree.c | 1 + fs/xfs/xfs_attr_leaf.c | 4 +- fs/xfs/xfs_bmap.c | 2 + fs/xfs/xfs_bmap_btree.c | 3 +- fs/xfs/xfs_bmap_btree.h | 1 + fs/xfs/xfs_btree.c | 1 + fs/xfs/xfs_btree.h | 2 + fs/xfs/xfs_da_btree.c | 3 ++ fs/xfs/xfs_dir2_block.c | 2 + fs/xfs/xfs_dir2_data.c | 11 +++-- fs/xfs/xfs_dir2_leaf.c | 19 +++++---- fs/xfs/xfs_dir2_node.c | 24 +++++++---- fs/xfs/xfs_dir2_priv.h | 2 + fs/xfs/xfs_dquot.c | 104 +++++++++++++++++++++++----------------------- fs/xfs/xfs_fsops.c | 8 +++- fs/xfs/xfs_ialloc.c | 5 ++- fs/xfs/xfs_ialloc.h | 4 +- fs/xfs/xfs_ialloc_btree.c | 1 + fs/xfs/xfs_inode.c | 14 ++++++- fs/xfs/xfs_inode.h | 1 + fs/xfs/xfs_mount.c | 2 +- fs/xfs/xfs_mount.h | 1 + 24 files changed, 137 insertions(+), 89 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index d12bbedf6fe5..545a6c4c2366 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -465,14 +465,14 @@ xfs_agfl_verify( #endif } -static void +void xfs_agfl_write_verify( struct xfs_buf *bp) { xfs_agfl_verify(bp); } -void +static void xfs_agfl_read_verify( struct xfs_buf *bp) { @@ -2181,14 +2181,14 @@ xfs_agf_verify( } } -static void +void xfs_agf_write_verify( struct xfs_buf *bp) { xfs_agf_verify(bp); } -void +static void xfs_agf_read_verify( struct xfs_buf *bp) { diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index feacb061bab7..f32811f50f43 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -231,4 +231,7 @@ xfs_alloc_get_rec( xfs_extlen_t *len, /* output: length of extent */ int *stat); /* output: success/failure */ +void xfs_agf_write_verify(struct xfs_buf *bp); +void xfs_agfl_write_verify(struct xfs_buf *bp); + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index 6e98b22ebde0..b83396524913 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -401,6 +401,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = { .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_allocbt_key_diff, .read_verify = xfs_allocbt_read_verify, + .write_verify = xfs_allocbt_write_verify, #ifdef DEBUG .keys_inorder = xfs_allocbt_keys_inorder, .recs_inorder = xfs_allocbt_recs_inorder, diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 57729d71ab1a..5cd5b0c1d17a 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -924,7 +924,7 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args) XFS_ATTR_FORK); if (error) goto out; - ASSERT(bp2 != NULL); + bp2->b_pre_io = bp1->b_pre_io; memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount)); bp1 = NULL; xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1); @@ -978,7 +978,7 @@ xfs_attr_leaf_create( XFS_ATTR_FORK); if (error) return(error); - ASSERT(bp != NULL); + bp->b_pre_io = xfs_attr_leaf_write_verify; leaf = bp->b_addr; memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount)); hdr = &leaf->hdr; diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 9ae7aba52e0f..6a0f3f9f39d3 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -3124,6 +3124,7 @@ xfs_bmap_extents_to_btree( /* * Fill in the child block. 
*/ + abp->b_pre_io = xfs_bmbt_write_verify; ablock = XFS_BUF_TO_BLOCK(abp); ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); ablock->bb_level = 0; @@ -3270,6 +3271,7 @@ xfs_bmap_local_to_extents( ASSERT(args.len == 1); *firstblock = args.fsbno; bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); + bp->b_pre_io = xfs_bmbt_write_verify; memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); xfs_bmap_forkoff_reset(args.mp, ip, whichfork); diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 17d7423e7503..79758e1e4f74 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -749,7 +749,7 @@ xfs_bmbt_verify( } } -static void +void xfs_bmbt_write_verify( struct xfs_buf *bp) { @@ -806,6 +806,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, .key_diff = xfs_bmbt_key_diff, .read_verify = xfs_bmbt_read_verify, + .write_verify = xfs_bmbt_write_verify, #ifdef DEBUG .keys_inorder = xfs_bmbt_keys_inorder, .recs_inorder = xfs_bmbt_recs_inorder, diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 1d00fbe9dd79..938c85986549 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -233,6 +233,7 @@ extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level); extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf); extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); extern void xfs_bmbt_read_verify(struct xfs_buf *bp); +extern void xfs_bmbt_write_verify(struct xfs_buf *bp); extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index ef1066078c33..1e2d89eed2a4 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -996,6 +996,7 @@ xfs_btree_get_buf_block( if (!*bpp) return ENOMEM; + (*bpp)->b_pre_io = cur->bc_ops->write_verify; *block = XFS_BUF_TO_BLOCK(*bpp); return 0; } diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 3a4c314047a0..458ab3550898 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -189,6 +189,8 @@ struct xfs_btree_ops { union xfs_btree_key *key); void (*read_verify)(struct xfs_buf *bp); + void (*write_verify)(struct xfs_buf *bp); + #ifdef DEBUG /* check that k1 is lower than k2 */ int (*keys_inorder)(struct xfs_btree_cur *cur, diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 6bb0a59eaaee..087950fc2eb7 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -193,6 +193,7 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, xfs_trans_log_buf(tp, bp, XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + bp->b_pre_io = xfs_da_node_write_verify; *bpp = bp; return(0); } @@ -392,6 +393,8 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, } memcpy(node, oldroot, size); xfs_trans_log_buf(tp, bp, 0, size - 1); + + bp->b_pre_io = blk1->bp->b_pre_io; blk1->bp = bp; blk1->blkno = blkno; diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 0f8793c74fe2..e2fdc6f03d8a 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -1010,6 +1010,7 @@ xfs_dir2_leaf_to_block( /* * Start converting it to block form. 
*/ + dbp->b_pre_io = xfs_dir2_block_write_verify; hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); needlog = 1; needscan = 0; @@ -1139,6 +1140,7 @@ xfs_dir2_sf_to_block( kmem_free(sfp); return error; } + bp->b_pre_io = xfs_dir2_block_write_verify; hdr = bp->b_addr; hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); /* diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index b555585f5ab6..dcb8a873ab92 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -185,7 +185,7 @@ __xfs_dir2_data_check( return 0; } -void +static void xfs_dir2_data_verify( struct xfs_buf *bp) { @@ -202,14 +202,14 @@ xfs_dir2_data_verify( } } -static void +void xfs_dir2_data_write_verify( struct xfs_buf *bp) { xfs_dir2_data_verify(bp); } -void +static void xfs_dir2_data_read_verify( struct xfs_buf *bp) { @@ -482,10 +482,9 @@ xfs_dir2_data_init( */ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp, XFS_DATA_FORK); - if (error) { + if (error) return error; - } - ASSERT(bp != NULL); + bp->b_pre_io = xfs_dir2_data_write_verify; /* * Initialize the header. diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 5b3bcab2a656..3002ab7d54c3 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -81,7 +81,7 @@ xfs_dir2_leaf1_read_verify( xfs_buf_ioend(bp, 0); } -static void +void xfs_dir2_leafn_write_verify( struct xfs_buf *bp) { @@ -198,6 +198,7 @@ xfs_dir2_block_to_leaf( /* * Fix up the block header, make it a data block. */ + dbp->b_pre_io = xfs_dir2_data_write_verify; hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); if (needscan) xfs_dir2_data_freescan(mp, hdr, &needlog); @@ -1243,15 +1244,14 @@ xfs_dir2_leaf_init( * Get the buffer for the block. */ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp, - XFS_DATA_FORK); - if (error) { + XFS_DATA_FORK); + if (error) return error; - } - ASSERT(bp != NULL); - leaf = bp->b_addr; + /* * Initialize the header. */ + leaf = bp->b_addr; leaf->hdr.info.magic = cpu_to_be16(magic); leaf->hdr.info.forw = 0; leaf->hdr.info.back = 0; @@ -1264,10 +1264,12 @@ xfs_dir2_leaf_init( * the block. */ if (magic == XFS_DIR2_LEAF1_MAGIC) { + bp->b_pre_io = xfs_dir2_leaf1_write_verify; ltp = xfs_dir2_leaf_tail_p(mp, leaf); ltp->bestcount = 0; xfs_dir2_leaf_log_tail(tp, bp); - } + } else + bp->b_pre_io = xfs_dir2_leafn_write_verify; *bpp = bp; return 0; } @@ -1951,7 +1953,10 @@ xfs_dir2_node_to_leaf( xfs_dir2_leaf_compact(args, lbp); else xfs_dir2_leaf_log_header(tp, lbp); + + lbp->b_pre_io = xfs_dir2_leaf1_write_verify; leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC); + /* * Set up the leaf tail from the freespace block. */ diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index a58abe1fc0d0..da90a91f4420 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -197,11 +197,12 @@ xfs_dir2_leaf_to_node( /* * Get the buffer for the new freespace block. */ - if ((error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp, - XFS_DATA_FORK))) { + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp, + XFS_DATA_FORK); + if (error) return error; - } - ASSERT(fbp != NULL); + fbp->b_pre_io = xfs_dir2_free_write_verify; + free = fbp->b_addr; leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(mp, leaf); @@ -223,7 +224,10 @@ xfs_dir2_leaf_to_node( *to = cpu_to_be16(off); } free->hdr.nused = cpu_to_be32(n); + + lbp->b_pre_io = xfs_dir2_leafn_write_verify; leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC); + /* * Log everything. 
*/ @@ -632,6 +636,7 @@ xfs_dir2_leafn_lookup_for_entry( state->extrablk.index = (int)((char *)dep - (char *)curbp->b_addr); state->extrablk.magic = XFS_DIR2_DATA_MAGIC; + curbp->b_pre_io = xfs_dir2_data_write_verify; if (cmp == XFS_CMP_EXACT) return XFS_ERROR(EEXIST); } @@ -646,6 +651,7 @@ xfs_dir2_leafn_lookup_for_entry( state->extrablk.index = -1; state->extrablk.blkno = curdb; state->extrablk.magic = XFS_DIR2_DATA_MAGIC; + curbp->b_pre_io = xfs_dir2_data_write_verify; } else { /* If the curbp is not the CI match block, drop it */ if (state->extrablk.bp != curbp) @@ -1638,12 +1644,12 @@ xfs_dir2_node_addname_int( /* * Get a buffer for the new block. */ - if ((error = xfs_da_get_buf(tp, dp, - xfs_dir2_db_to_da(mp, fbno), - -1, &fbp, XFS_DATA_FORK))) { + error = xfs_da_get_buf(tp, dp, + xfs_dir2_db_to_da(mp, fbno), + -1, &fbp, XFS_DATA_FORK); + if (error) return error; - } - ASSERT(fbp != NULL); + fbp->b_pre_io = xfs_dir2_free_write_verify; /* * Initialize the new block to be empty, and remember diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 7ec61af8449f..01b82dcddc3e 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -45,6 +45,7 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, #else #define xfs_dir2_data_check(dp,bp) #endif +extern void xfs_dir2_data_write_verify(struct xfs_buf *bp); extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); @@ -73,6 +74,7 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp, /* xfs_dir2_leaf.c */ extern void xfs_dir2_leafn_read_verify(struct xfs_buf *bp); +extern void xfs_dir2_leafn_write_verify(struct xfs_buf *bp); extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index b38a10e6f2e0..1b06aa051074 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -248,7 +248,57 @@ xfs_qm_init_dquot_blk( xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); } +static void +xfs_dquot_buf_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; + struct xfs_disk_dquot *ddq; + xfs_dqid_t id = 0; + int i; + + /* + * On the first read of the buffer, verify that each dquot is valid. + * We don't know what the id of the dquot is supposed to be, just that + * they should be increasing monotonically within the buffer. If the + * first id is corrupt, then it will fail on the second dquot in the + * buffer so corruptions could point to the wrong dquot in this case. + */ + for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { + int error; + + ddq = &d[i].dd_diskdq; + + if (i == 0) + id = be32_to_cpu(ddq->d_id); + + error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, + "xfs_dquot_read_verify"); + if (error) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d); + xfs_buf_ioerror(bp, EFSCORRUPTED); + break; + } + } +} + +static void +xfs_dquot_buf_write_verify( + struct xfs_buf *bp) +{ + xfs_dquot_buf_verify(bp); +} +void +xfs_dquot_buf_read_verify( + struct xfs_buf *bp) +{ + xfs_dquot_buf_verify(bp); + bp->b_pre_io = xfs_dquot_buf_write_verify; + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} /* * Allocate a block and fill it with dquots. 
@@ -315,6 +365,7 @@ xfs_qm_dqalloc( error = xfs_buf_geterror(bp); if (error) goto error1; + bp->b_pre_io = xfs_dquot_buf_write_verify; /* * Make a chunk of dquots out of this buffer and log @@ -359,59 +410,6 @@ xfs_qm_dqalloc( return (error); } - -static void -xfs_dquot_buf_verify( - struct xfs_buf *bp) -{ - struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; - struct xfs_disk_dquot *ddq; - xfs_dqid_t id = 0; - int i; - - /* - * On the first read of the buffer, verify that each dquot is valid. - * We don't know what the id of the dquot is supposed to be, just that - * they should be increasing monotonically within the buffer. If the - * first id is corrupt, then it will fail on the second dquot in the - * buffer so corruptions could point to the wrong dquot in this case. - */ - for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { - int error; - - ddq = &d[i].dd_diskdq; - - if (i == 0) - id = be32_to_cpu(ddq->d_id); - - error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, - "xfs_dquot_read_verify"); - if (error) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d); - xfs_buf_ioerror(bp, EFSCORRUPTED); - break; - } - } -} - -static void -xfs_dquot_buf_write_verify( - struct xfs_buf *bp) -{ - xfs_dquot_buf_verify(bp); -} - -void -xfs_dquot_buf_read_verify( - struct xfs_buf *bp) -{ - xfs_dquot_buf_verify(bp); - bp->b_pre_io = xfs_dquot_buf_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); -} - STATIC int xfs_qm_dqrepair( struct xfs_mount *mp, diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index cb65b067ed31..5d6d6b9d369d 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -222,6 +222,7 @@ xfs_growfs_data_private( error = ENOMEM; goto error0; } + bp->b_pre_io = xfs_agf_write_verify; agf = XFS_BUF_TO_AGF(bp); agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); @@ -259,6 +260,7 @@ xfs_growfs_data_private( error = ENOMEM; goto error0; } + bp->b_pre_io = xfs_agfl_write_verify; agfl = XFS_BUF_TO_AGFL(bp); for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) @@ -279,6 +281,7 @@ xfs_growfs_data_private( error = ENOMEM; goto error0; } + bp->b_pre_io = xfs_agi_write_verify; agi = XFS_BUF_TO_AGI(bp); agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); @@ -450,9 +453,10 @@ xfs_growfs_data_private( bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), XFS_FSS_TO_BB(mp, 1), 0); - if (bp) + if (bp) { xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - else + bp->b_pre_io = xfs_sb_write_verify; + } else error = ENOMEM; } diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 070f41845572..faf68600d3a6 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -210,6 +210,7 @@ xfs_ialloc_inode_init( * to log a whole cluster of inodes instead of all the * individual transactions causing a lot of log traffic. */ + fbuf->b_pre_io = xfs_inode_buf_write_verify; xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); for (i = 0; i < ninodes; i++) { int ioffset = i << mp->m_sb.sb_inodelog; @@ -1504,14 +1505,14 @@ xfs_agi_verify( xfs_check_agi_unlinked(agi); } -static void +void xfs_agi_write_verify( struct xfs_buf *bp) { xfs_agi_verify(bp); } -void +static void xfs_agi_read_verify( struct xfs_buf *bp) { diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index 1fd6ea4e9c91..7a169e34e30e 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -147,7 +147,9 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino, /* * Get the data from the pointed-to record. 
*/ -extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, +int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_inobt_rec_incore_t *rec, int *stat); +void xfs_agi_write_verify(struct xfs_buf *bp); + #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index 15a79f8ca03c..7761e1ebeff7 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -271,6 +271,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = { .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, .key_diff = xfs_inobt_key_diff, .read_verify = xfs_inobt_read_verify, + .write_verify = xfs_inobt_write_verify, #ifdef DEBUG .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 910b2da01042..dfcbe73f1db4 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -420,7 +420,7 @@ xfs_inode_buf_verify( xfs_inobp_check(mp, bp); } -static void +void xfs_inode_buf_write_verify( struct xfs_buf *bp) { @@ -1782,6 +1782,18 @@ xfs_ifree_cluster( if (!bp) return ENOMEM; + + /* + * This buffer may not have been correctly initialised as we + * didn't read it from disk. That's not important because we are + * only using it to mark the buffer as stale in the log, and to + * attach stale cached inodes on it. That means it will never be + * dispatched for IO. If it is, we want to know about it, and we + * want it to fail. We can achieve this by adding a write + * verifier to the buffer. + */ + bp->b_pre_io = xfs_inode_buf_write_verify; + /* * Walk the inodes already attached to the buffer and mark them * stale. These will all have the flush locks held, so an diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index a322c19723a3..482214d120a7 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -555,6 +555,7 @@ int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, int xfs_iread(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, uint); void xfs_inode_buf_read_verify(struct xfs_buf *); +void xfs_inode_buf_write_verify(struct xfs_buf *); void xfs_dinode_to_disk(struct xfs_dinode *, struct xfs_icdinode *); void xfs_idestroy_fork(struct xfs_inode *, int); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index c85da75e4a43..152a7fc843f9 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -631,7 +631,7 @@ xfs_sb_verify( xfs_buf_ioerror(bp, error); } -static void +void xfs_sb_write_verify( struct xfs_buf *bp) { diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index de9089acc610..29c1b3ac920e 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -386,6 +386,7 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *); #endif /* __KERNEL__ */ extern void xfs_sb_read_verify(struct xfs_buf *); +extern void xfs_sb_write_verify(struct xfs_buf *bp); extern void xfs_mod_sb(struct xfs_trans *, __int64_t); extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, xfs_agnumber_t *); -- cgit v1.2.1 From 1813dd64057490e7a0678a885c4fe6d02f78bdc1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 14 Nov 2012 17:54:40 +1100 Subject: xfs: convert buffer verifiers to an ops structure. To separate the verifiers from iodone functions and associate read and write verifiers at the same time, introduce a buffer verifier operations structure to the xfs_buf. This avoids the need for assigning the write verifier, clearing the iodone function and re-running ioend processing in the read verifier, and gets rid of the nasty "b_pre_io" name for the write verifier function pointer.
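For orientation before the diff itself: the operations structure this commit introduces is, in full, just a pair of verifier callbacks (as defined in the xfs_buf.h hunk below), and each buffer carries a single const pointer to one:

struct xfs_buf_ops {
	void (*verify_read)(struct xfs_buf *);
	void (*verify_write)(struct xfs_buf *);
};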
If we ever need to, it will also be easier to add further content-specific callbacks to a buffer with an ops structure in place. We also avoid needing to export verifier functions; instead we can simply export the ops structures for those that are needed outside the function they are defined in. This patch also fixes a directory block readahead verifier issue it exposed. This patch also adds ops callbacks to the inode/alloc btree blocks initialised by growfs. These will need more work before they will work with CRCs. Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_ag.h | 4 +++ fs/xfs/xfs_alloc.c | 28 ++++++++++++--------- fs/xfs/xfs_alloc.h | 4 +-- fs/xfs/xfs_alloc_btree.c | 18 ++++++++------ fs/xfs/xfs_alloc_btree.h | 2 ++ fs/xfs/xfs_attr_leaf.c | 19 +++++++------- fs/xfs/xfs_attr_leaf.h | 3 ++- fs/xfs/xfs_bmap.c | 22 ++++++++--------- fs/xfs/xfs_bmap_btree.c | 20 ++++++++------- fs/xfs/xfs_bmap_btree.h | 3 +-- fs/xfs/xfs_btree.c | 26 +++++++++---------- fs/xfs/xfs_btree.h | 9 +++---- fs/xfs/xfs_buf.c | 63 ++++++++++++++++++++++++++++------------------- fs/xfs/xfs_buf.h | 24 ++++++++++-------- fs/xfs/xfs_da_btree.c | 40 +++++++++++++++++++----------- fs/xfs/xfs_da_btree.h | 4 +-- fs/xfs/xfs_dir2_block.c | 20 ++++++++------- fs/xfs/xfs_dir2_data.c | 52 +++++++++++++++++++++++++++++++------- fs/xfs/xfs_dir2_leaf.c | 36 +++++++++++++++------------ fs/xfs/xfs_dir2_node.c | 26 ++++++++++--------- fs/xfs/xfs_dir2_priv.h | 10 +++++--- fs/xfs/xfs_dquot.c | 18 ++++++++------ fs/xfs/xfs_dquot.h | 3 ++- fs/xfs/xfs_fsops.c | 29 +++++++++++++--------- fs/xfs/xfs_ialloc.c | 18 ++++++++------ fs/xfs/xfs_ialloc.h | 2 +- fs/xfs/xfs_ialloc_btree.c | 17 +++++++------ fs/xfs/xfs_ialloc_btree.h | 2 ++ fs/xfs/xfs_inode.c | 22 ++++++++++------- fs/xfs/xfs_inode.h | 3 +-- fs/xfs/xfs_itable.c | 2 +- fs/xfs/xfs_log_recover.c | 2 +- fs/xfs/xfs_mount.c | 35 +++++++++++++++----------- fs/xfs/xfs_mount.h | 4 +-- fs/xfs/xfs_qm.c | 2 +- fs/xfs/xfs_trans.h | 6 ++--- fs/xfs/xfs_trans_buf.c | 8 +++--- 37 files changed, 357 insertions(+), 249 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 22bd4db011c8..f2aeedb6a579 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -108,6 +108,8 @@ typedef struct xfs_agf { extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); +extern const struct xfs_buf_ops xfs_agf_buf_ops; + /* * Size of the unlinked inode hash table in the agi. */ @@ -161,6 +163,8 @@ typedef struct xfs_agi { extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **bpp); +extern const struct xfs_buf_ops xfs_agi_buf_ops; + /* * The third a.g. block contains the a.g. freelist, an array * of block pointers to blocks owned by the allocation btree code. diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 545a6c4c2366..393055fe3aef 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -465,7 +465,7 @@ xfs_agfl_verify( #endif } -void +static void xfs_agfl_write_verify( struct xfs_buf *bp) { @@ -477,11 +477,13 @@ xfs_agfl_read_verify( struct xfs_buf *bp) { xfs_agfl_verify(bp); - bp->b_pre_io = xfs_agfl_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +const struct xfs_buf_ops xfs_agfl_buf_ops = { + .verify_read = xfs_agfl_read_verify, + .verify_write = xfs_agfl_write_verify, +}; + /* * Read in the allocation group free block array.
*/ @@ -499,7 +501,7 @@ xfs_alloc_read_agfl( error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp, xfs_agfl_read_verify); + XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops); if (error) return error; ASSERT(!xfs_buf_geterror(bp)); @@ -2181,23 +2183,25 @@ xfs_agf_verify( } } -void -xfs_agf_write_verify( +static void +xfs_agf_read_verify( struct xfs_buf *bp) { xfs_agf_verify(bp); } static void -xfs_agf_read_verify( +xfs_agf_write_verify( struct xfs_buf *bp) { xfs_agf_verify(bp); - bp->b_pre_io = xfs_agf_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +const struct xfs_buf_ops xfs_agf_buf_ops = { + .verify_read = xfs_agf_read_verify, + .verify_write = xfs_agf_write_verify, +}; + /* * Read in the allocation group header (free/alloc section). */ @@ -2215,7 +2219,7 @@ xfs_read_agf( error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), flags, bpp, xfs_agf_read_verify); + XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops); if (error) return error; if (!*bpp) diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index f32811f50f43..99d0a6101558 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -231,7 +231,7 @@ xfs_alloc_get_rec( xfs_extlen_t *len, /* output: length of extent */ int *stat); /* output: success/failure */ -void xfs_agf_write_verify(struct xfs_buf *bp); -void xfs_agfl_write_verify(struct xfs_buf *bp); +extern const struct xfs_buf_ops xfs_agf_buf_ops; +extern const struct xfs_buf_ops xfs_agfl_buf_ops; #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index b83396524913..b1ddef6b2689 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -329,22 +329,25 @@ xfs_allocbt_verify( } static void -xfs_allocbt_write_verify( +xfs_allocbt_read_verify( struct xfs_buf *bp) { xfs_allocbt_verify(bp); } -void -xfs_allocbt_read_verify( +static void +xfs_allocbt_write_verify( struct xfs_buf *bp) { xfs_allocbt_verify(bp); - bp->b_pre_io = xfs_allocbt_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +const struct xfs_buf_ops xfs_allocbt_buf_ops = { + .verify_read = xfs_allocbt_read_verify, + .verify_write = xfs_allocbt_write_verify, +}; + + #ifdef DEBUG STATIC int xfs_allocbt_keys_inorder( @@ -400,8 +403,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = { .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_allocbt_key_diff, - .read_verify = xfs_allocbt_read_verify, - .write_verify = xfs_allocbt_write_verify, + .buf_ops = &xfs_allocbt_buf_ops, #ifdef DEBUG .keys_inorder = xfs_allocbt_keys_inorder, .recs_inorder = xfs_allocbt_recs_inorder, diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h index 359fb86ed876..7e89a2b429dd 100644 --- a/fs/xfs/xfs_alloc_btree.h +++ b/fs/xfs/xfs_alloc_btree.h @@ -93,4 +93,6 @@ extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *, xfs_agnumber_t, xfs_btnum_t); extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); +extern const struct xfs_buf_ops xfs_allocbt_buf_ops; + #endif /* __XFS_ALLOC_BTREE_H__ */ diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 5cd5b0c1d17a..ee24993c7d12 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -104,22 +104,23 @@ xfs_attr_leaf_verify( } static void -xfs_attr_leaf_write_verify( +xfs_attr_leaf_read_verify( struct xfs_buf *bp) { xfs_attr_leaf_verify(bp); } -void 
-xfs_attr_leaf_read_verify( +static void +xfs_attr_leaf_write_verify( struct xfs_buf *bp) { xfs_attr_leaf_verify(bp); - bp->b_pre_io = xfs_attr_leaf_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +const struct xfs_buf_ops xfs_attr_leaf_buf_ops = { + .verify_read = xfs_attr_leaf_read_verify, + .verify_write = xfs_attr_leaf_write_verify, +}; int xfs_attr_leaf_read( @@ -130,7 +131,7 @@ xfs_attr_leaf_read( struct xfs_buf **bpp) { return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, - XFS_ATTR_FORK, xfs_attr_leaf_read_verify); + XFS_ATTR_FORK, &xfs_attr_leaf_buf_ops); } /*======================================================================== @@ -924,7 +925,7 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args) XFS_ATTR_FORK); if (error) goto out; - bp2->b_pre_io = bp1->b_pre_io; + bp2->b_ops = bp1->b_ops; memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount)); bp1 = NULL; xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1); @@ -978,7 +979,7 @@ xfs_attr_leaf_create( XFS_ATTR_FORK); if (error) return(error); - bp->b_pre_io = xfs_attr_leaf_write_verify; + bp->b_ops = &xfs_attr_leaf_buf_ops; leaf = bp->b_addr; memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount)); hdr = &leaf->hdr; diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index 3bbf6277e43c..77de139a58f0 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -264,6 +264,7 @@ int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); -void xfs_attr_leaf_read_verify(struct xfs_buf *bp); + +extern const struct xfs_buf_ops xfs_attr_leaf_buf_ops; #endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 6a0f3f9f39d3..0e92d12765d2 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2663,7 +2663,7 @@ xfs_bmap_btree_to_extents( return error; #endif error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF, - xfs_bmbt_read_verify); + &xfs_bmbt_buf_ops); if (error) return error; cblock = XFS_BUF_TO_BLOCK(cbp); @@ -3124,7 +3124,7 @@ xfs_bmap_extents_to_btree( /* * Fill in the child block. */ - abp->b_pre_io = xfs_bmbt_write_verify; + abp->b_ops = &xfs_bmbt_buf_ops; ablock = XFS_BUF_TO_BLOCK(abp); ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); ablock->bb_level = 0; @@ -3271,7 +3271,7 @@ xfs_bmap_local_to_extents( ASSERT(args.len == 1); *firstblock = args.fsbno; bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); - bp->b_pre_io = xfs_bmbt_write_verify; + bp->b_ops = &xfs_bmbt_buf_ops; memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); xfs_bmap_forkoff_reset(args.mp, ip, whichfork); @@ -4082,7 +4082,7 @@ xfs_bmap_read_extents( */ while (level-- > 0) { error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF, xfs_bmbt_read_verify); + XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); if (error) return error; block = XFS_BUF_TO_BLOCK(bp); @@ -4129,7 +4129,7 @@ xfs_bmap_read_extents( nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); if (nextbno != NULLFSBLOCK) xfs_btree_reada_bufl(mp, nextbno, 1, - xfs_bmbt_read_verify); + &xfs_bmbt_buf_ops); /* * Copy records into the extent records. 
*/ @@ -4162,7 +4162,7 @@ xfs_bmap_read_extents( if (bno == NULLFSBLOCK) break; error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF, xfs_bmbt_read_verify); + XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); if (error) return error; block = XFS_BUF_TO_BLOCK(bp); @@ -5880,7 +5880,7 @@ xfs_bmap_check_leaf_extents( bp_release = 1; error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, XFS_BMAP_BTREE_REF, - xfs_bmbt_read_verify); + &xfs_bmbt_buf_ops); if (error) goto error_norelse; } @@ -5966,7 +5966,7 @@ xfs_bmap_check_leaf_extents( bp_release = 1; error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, XFS_BMAP_BTREE_REF, - xfs_bmbt_read_verify); + &xfs_bmbt_buf_ops); if (error) goto error_norelse; } @@ -6061,7 +6061,7 @@ xfs_bmap_count_tree( int numrecs; error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF, - xfs_bmbt_read_verify); + &xfs_bmbt_buf_ops); if (error) return error; *count += 1; @@ -6073,7 +6073,7 @@ xfs_bmap_count_tree( while (nextbno != NULLFSBLOCK) { error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp, XFS_BMAP_BTREE_REF, - xfs_bmbt_read_verify); + &xfs_bmbt_buf_ops); if (error) return error; *count += 1; @@ -6105,7 +6105,7 @@ xfs_bmap_count_tree( bno = nextbno; error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF, - xfs_bmbt_read_verify); + &xfs_bmbt_buf_ops); if (error) return error; *count += 1; diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 79758e1e4f74..061b45cbe614 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -749,23 +749,26 @@ xfs_bmbt_verify( } } -void -xfs_bmbt_write_verify( +static void +xfs_bmbt_read_verify( struct xfs_buf *bp) { xfs_bmbt_verify(bp); } -void -xfs_bmbt_read_verify( +static void +xfs_bmbt_write_verify( struct xfs_buf *bp) { xfs_bmbt_verify(bp); - bp->b_pre_io = xfs_bmbt_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +const struct xfs_buf_ops xfs_bmbt_buf_ops = { + .verify_read = xfs_bmbt_read_verify, + .verify_write = xfs_bmbt_write_verify, +}; + + #ifdef DEBUG STATIC int xfs_bmbt_keys_inorder( @@ -805,8 +808,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { .init_rec_from_cur = xfs_bmbt_init_rec_from_cur, .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, .key_diff = xfs_bmbt_key_diff, - .read_verify = xfs_bmbt_read_verify, - .write_verify = xfs_bmbt_write_verify, + .buf_ops = &xfs_bmbt_buf_ops, #ifdef DEBUG .keys_inorder = xfs_bmbt_keys_inorder, .recs_inorder = xfs_bmbt_recs_inorder, diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 938c85986549..88469ca08696 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -232,11 +232,10 @@ extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int, extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level); extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf); extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); -extern void xfs_bmbt_read_verify(struct xfs_buf *bp); -extern void xfs_bmbt_write_verify(struct xfs_buf *bp); extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); +extern const struct xfs_buf_ops xfs_bmbt_buf_ops; #endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 1e2d89eed2a4..db010408d701 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -271,7 +271,7 @@ xfs_btree_dup_cursor( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_BUF_ADDR(bp), mp->m_bsize, 0, &bp, - 
cur->bc_ops->read_verify); + cur->bc_ops->buf_ops); if (error) { xfs_btree_del_cursor(new, error); *ncur = NULL; @@ -621,7 +621,7 @@ xfs_btree_read_bufl( uint lock, /* lock flags for read_buf */ struct xfs_buf **bpp, /* buffer for fsbno */ int refval, /* ref count value for buffer */ - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { struct xfs_buf *bp; /* return value */ xfs_daddr_t d; /* real disk block address */ @@ -630,7 +630,7 @@ xfs_btree_read_bufl( ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, - mp->m_bsize, lock, &bp, verify); + mp->m_bsize, lock, &bp, ops); if (error) return error; ASSERT(!xfs_buf_geterror(bp)); @@ -650,13 +650,13 @@ xfs_btree_reada_bufl( struct xfs_mount *mp, /* file system mount point */ xfs_fsblock_t fsbno, /* file system block number */ xfs_extlen_t count, /* count of filesystem blocks */ - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { xfs_daddr_t d; ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); - xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, verify); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops); } /* @@ -670,14 +670,14 @@ xfs_btree_reada_bufs( xfs_agnumber_t agno, /* allocation group number */ xfs_agblock_t agbno, /* allocation group block number */ xfs_extlen_t count, /* count of filesystem blocks */ - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { xfs_daddr_t d; ASSERT(agno != NULLAGNUMBER); ASSERT(agbno != NULLAGBLOCK); d = XFS_AGB_TO_DADDR(mp, agno, agbno); - xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, verify); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops); } STATIC int @@ -692,13 +692,13 @@ xfs_btree_readahead_lblock( if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) { xfs_btree_reada_bufl(cur->bc_mp, left, 1, - cur->bc_ops->read_verify); + cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) { xfs_btree_reada_bufl(cur->bc_mp, right, 1, - cur->bc_ops->read_verify); + cur->bc_ops->buf_ops); rval++; } @@ -718,13 +718,13 @@ xfs_btree_readahead_sblock( if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, - left, 1, cur->bc_ops->read_verify); + left, 1, cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, - right, 1, cur->bc_ops->read_verify); + right, 1, cur->bc_ops->buf_ops); rval++; } @@ -996,7 +996,7 @@ xfs_btree_get_buf_block( if (!*bpp) return ENOMEM; - (*bpp)->b_pre_io = cur->bc_ops->write_verify; + (*bpp)->b_ops = cur->bc_ops->buf_ops; *block = XFS_BUF_TO_BLOCK(*bpp); return 0; } @@ -1024,7 +1024,7 @@ xfs_btree_read_buf_block( d = xfs_btree_ptr_to_daddr(cur, ptr); error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize, flags, bpp, - cur->bc_ops->read_verify); + cur->bc_ops->buf_ops); if (error) return error; diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 458ab3550898..f932897194eb 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -188,8 +188,7 @@ struct xfs_btree_ops { __int64_t (*key_diff)(struct xfs_btree_cur *cur, union xfs_btree_key *key); - void (*read_verify)(struct xfs_buf *bp); - void (*write_verify)(struct xfs_buf *bp); + const struct xfs_buf_ops *buf_ops; #ifdef DEBUG /* check that k1 is lower than k2 */ @@ -359,7 +358,7 @@ xfs_btree_read_bufl( uint lock, /* lock flags for read_buf */ struct xfs_buf **bpp, /* buffer for 
fsbno */ int refval, /* ref count value for buffer */ - xfs_buf_iodone_t verify); + const struct xfs_buf_ops *ops); /* * Read-ahead the block, don't wait for it, don't return a buffer. @@ -370,7 +369,7 @@ xfs_btree_reada_bufl( struct xfs_mount *mp, /* file system mount point */ xfs_fsblock_t fsbno, /* file system block number */ xfs_extlen_t count, /* count of filesystem blocks */ - xfs_buf_iodone_t verify); + const struct xfs_buf_ops *ops); /* * Read-ahead the block, don't wait for it, don't return a buffer. @@ -382,7 +381,7 @@ xfs_btree_reada_bufs( xfs_agnumber_t agno, /* allocation group number */ xfs_agblock_t agbno, /* allocation group block number */ xfs_extlen_t count, /* count of filesystem blocks */ - xfs_buf_iodone_t verify); + const struct xfs_buf_ops *ops); /* * Initialise a new btree block header diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index bd1a948ee39c..26673a0b20e7 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -571,7 +571,7 @@ found: ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); ASSERT(bp->b_iodone == NULL); bp->b_flags &= _XBF_KMEM | _XBF_PAGES; - bp->b_pre_io = NULL; + bp->b_ops = NULL; } trace_xfs_buf_find(bp, flags, _RET_IP_); @@ -657,7 +657,7 @@ xfs_buf_read_map( struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { struct xfs_buf *bp; @@ -669,7 +669,7 @@ xfs_buf_read_map( if (!XFS_BUF_ISDONE(bp)) { XFS_STATS_INC(xb_get_read); - bp->b_iodone = verify; + bp->b_ops = ops; _xfs_buf_read(bp, flags); } else if (flags & XBF_ASYNC) { /* @@ -696,13 +696,13 @@ xfs_buf_readahead_map( struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { if (bdi_read_congested(target->bt_bdi)) return; xfs_buf_read_map(target, map, nmaps, - XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, verify); + XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops); } /* @@ -715,7 +715,7 @@ xfs_buf_read_uncached( xfs_daddr_t daddr, size_t numblks, int flags, - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { struct xfs_buf *bp; @@ -728,7 +728,7 @@ xfs_buf_read_uncached( bp->b_bn = daddr; bp->b_maps[0].bm_bn = daddr; bp->b_flags |= XBF_READ; - bp->b_iodone = verify; + bp->b_ops = ops; xfsbdstrat(target->bt_mount, bp); xfs_buf_iowait(bp); @@ -1001,27 +1001,37 @@ STATIC void xfs_buf_iodone_work( struct work_struct *work) { - xfs_buf_t *bp = + struct xfs_buf *bp = container_of(work, xfs_buf_t, b_iodone_work); + bool read = !!(bp->b_flags & XBF_READ); + + bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); + if (read && bp->b_ops) + bp->b_ops->verify_read(bp); if (bp->b_iodone) (*(bp->b_iodone))(bp); else if (bp->b_flags & XBF_ASYNC) xfs_buf_relse(bp); + else { + ASSERT(read && bp->b_ops); + complete(&bp->b_iowait); + } } void xfs_buf_ioend( - xfs_buf_t *bp, - int schedule) + struct xfs_buf *bp, + int schedule) { + bool read = !!(bp->b_flags & XBF_READ); + trace_xfs_buf_iodone(bp, _RET_IP_); - bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); if (bp->b_error == 0) bp->b_flags |= XBF_DONE; - if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) { + if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) { if (schedule) { INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work); queue_work(xfslogd_workqueue, &bp->b_iodone_work); @@ -1029,6 +1039,7 @@ xfs_buf_ioend( xfs_buf_iodone_work(&bp->b_iodone_work); } } else { + bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); complete(&bp->b_iowait); } } @@ -1316,6 +1327,20 @@ _xfs_buf_ioapply( rw |= REQ_FUA; if 
(bp->b_flags & XBF_FLUSH) rw |= REQ_FLUSH; + + /* + * Run the write verifier callback function if it exists. If + * this function fails it will mark the buffer with an error and + * the IO should not be dispatched. + */ + if (bp->b_ops) { + bp->b_ops->verify_write(bp); + if (bp->b_error) { + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_CORRUPT_INCORE); + return; + } + } } else if (bp->b_flags & XBF_READ_AHEAD) { rw = READA; } else { @@ -1325,20 +1350,6 @@ _xfs_buf_ioapply( /* we only use the buffer cache for meta-data */ rw |= REQ_META; - /* - * run the pre-io callback function if it exists. If this function - * fails it will mark the buffer with an error and the IO should - * not be dispatched. - */ - if (bp->b_pre_io) { - bp->b_pre_io(bp); - if (bp->b_error) { - xfs_force_shutdown(bp->b_target->bt_mount, - SHUTDOWN_CORRUPT_INCORE); - return; - } - } - /* * Walk all the vectors issuing IO on them. Set up the initial offset * into the buffer and the desired IO size before we start - diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 51bc16a1cd9c..23f5642480bb 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -111,6 +111,11 @@ struct xfs_buf_map { #define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \ struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) }; +struct xfs_buf_ops { + void (*verify_read)(struct xfs_buf *); + void (*verify_write)(struct xfs_buf *); +}; + typedef struct xfs_buf { /* * first cacheline holds all the fields needed for an uncontended cache @@ -154,9 +159,7 @@ typedef struct xfs_buf { unsigned int b_page_count; /* size of page array */ unsigned int b_offset; /* page offset in first page */ unsigned short b_error; /* error code on I/O */ - - void (*b_pre_io)(struct xfs_buf *); - /* pre-io callback function */ + const struct xfs_buf_ops *b_ops; #ifdef XFS_BUF_LOCK_TRACKING int b_last_holder; @@ -199,10 +202,11 @@ struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target, xfs_buf_flags_t flags); struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags, xfs_buf_iodone_t verify); + xfs_buf_flags_t flags, + const struct xfs_buf_ops *ops); void xfs_buf_readahead_map(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, - xfs_buf_iodone_t verify); + const struct xfs_buf_ops *ops); static inline struct xfs_buf * xfs_buf_get( @@ -221,10 +225,10 @@ xfs_buf_read( xfs_daddr_t blkno, size_t numblks, xfs_buf_flags_t flags, - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - return xfs_buf_read_map(target, &map, 1, flags, verify); + return xfs_buf_read_map(target, &map, 1, flags, ops); } static inline void @@ -232,10 +236,10 @@ xfs_buf_readahead( struct xfs_buftarg *target, xfs_daddr_t blkno, size_t numblks, - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - return xfs_buf_readahead_map(target, &map, 1, verify); + return xfs_buf_readahead_map(target, &map, 1, ops); } struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks); @@ -246,7 +250,7 @@ struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags); struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, int flags, - xfs_buf_iodone_t verify); + const struct xfs_buf_ops *ops); void xfs_buf_hold(struct xfs_buf *bp); /* Releasing Buffers */ diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 
087950fc2eb7..4d7696a02418 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -117,6 +117,12 @@ xfs_da_node_write_verify( xfs_da_node_verify(bp); } +/* + * leaf/node format detection on trees is sketchy, so a node read can be done on + * leaf level blocks when detection identifies the tree as a node format tree + * incorrectly. In this case, we need to swap the verifier to match the correct + * format of the block being read. + */ static void xfs_da_node_read_verify( struct xfs_buf *bp) @@ -129,10 +135,12 @@ xfs_da_node_read_verify( xfs_da_node_verify(bp); break; case XFS_ATTR_LEAF_MAGIC: - xfs_attr_leaf_read_verify(bp); + bp->b_ops = &xfs_attr_leaf_buf_ops; + bp->b_ops->verify_read(bp); return; case XFS_DIR2_LEAFN_MAGIC: - xfs_dir2_leafn_read_verify(bp); + bp->b_ops = &xfs_dir2_leafn_buf_ops; + bp->b_ops->verify_read(bp); return; default: XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, @@ -140,12 +148,14 @@ xfs_da_node_read_verify( xfs_buf_ioerror(bp, EFSCORRUPTED); break; } - - bp->b_pre_io = xfs_da_node_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +const struct xfs_buf_ops xfs_da_node_buf_ops = { + .verify_read = xfs_da_node_read_verify, + .verify_write = xfs_da_node_write_verify, +}; + + int xfs_da_node_read( struct xfs_trans *tp, @@ -156,7 +166,7 @@ xfs_da_node_read( int which_fork) { return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, - which_fork, xfs_da_node_read_verify); + which_fork, &xfs_da_node_buf_ops); } /*======================================================================== @@ -193,7 +203,7 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, xfs_trans_log_buf(tp, bp, XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); - bp->b_pre_io = xfs_da_node_write_verify; + bp->b_ops = &xfs_da_node_buf_ops; *bpp = bp; return(0); } @@ -394,7 +404,7 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, memcpy(node, oldroot, size); xfs_trans_log_buf(tp, bp, 0, size - 1); - bp->b_pre_io = blk1->bp->b_pre_io; + bp->b_ops = blk1->bp->b_ops; blk1->bp = bp; blk1->blkno = blkno; @@ -828,11 +838,11 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) /* * This could be copying a leaf back into the root block in the case of * there only being a single leaf block left in the tree. Hence we have - * to update the pre_io pointer as well to match the buffer type change + * to update the b_ops pointer as well to match the buffer type change * that could occur. 
*/ memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize); - root_blk->bp->b_pre_io = bp->b_pre_io; + root_blk->bp->b_ops = bp->b_ops; xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); error = xfs_da_shrink_inode(args, child, bp); return(error); @@ -2223,7 +2233,7 @@ xfs_da_read_buf( xfs_daddr_t mappedbno, struct xfs_buf **bpp, int whichfork, - xfs_buf_iodone_t verifier) + const struct xfs_buf_ops *ops) { struct xfs_buf *bp; struct xfs_buf_map map; @@ -2245,7 +2255,7 @@ xfs_da_read_buf( error = xfs_trans_read_buf_map(dp->i_mount, trans, dp->i_mount->m_ddev_targp, - mapp, nmap, 0, &bp, verifier); + mapp, nmap, 0, &bp, ops); if (error) goto out_free; @@ -2303,7 +2313,7 @@ xfs_da_reada_buf( xfs_dablk_t bno, xfs_daddr_t mappedbno, int whichfork, - xfs_buf_iodone_t verifier) + const struct xfs_buf_ops *ops) { struct xfs_buf_map map; struct xfs_buf_map *mapp; @@ -2322,7 +2332,7 @@ xfs_da_reada_buf( } mappedbno = mapp[0].bm_bn; - xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, NULL); + xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops); out_free: if (mapp != &map) diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 521b008445ab..ee5170c46ae1 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -229,10 +229,10 @@ int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp, int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp, int whichfork, - xfs_buf_iodone_t verifier); + const struct xfs_buf_ops *ops); xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno, - int whichfork, xfs_buf_iodone_t verifier); + int whichfork, const struct xfs_buf_ops *ops); int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, struct xfs_buf *dead_buf); diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index e2fdc6f03d8a..7536faaa61e7 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -74,22 +74,24 @@ xfs_dir2_block_verify( } static void -xfs_dir2_block_write_verify( +xfs_dir2_block_read_verify( struct xfs_buf *bp) { xfs_dir2_block_verify(bp); } -void -xfs_dir2_block_read_verify( +static void +xfs_dir2_block_write_verify( struct xfs_buf *bp) { xfs_dir2_block_verify(bp); - bp->b_pre_io = xfs_dir2_block_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +const struct xfs_buf_ops xfs_dir2_block_buf_ops = { + .verify_read = xfs_dir2_block_read_verify, + .verify_write = xfs_dir2_block_write_verify, +}; + static int xfs_dir2_block_read( struct xfs_trans *tp, @@ -99,7 +101,7 @@ xfs_dir2_block_read( struct xfs_mount *mp = dp->i_mount; return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp, - XFS_DATA_FORK, xfs_dir2_block_read_verify); + XFS_DATA_FORK, &xfs_dir2_block_buf_ops); } static void @@ -1010,7 +1012,7 @@ xfs_dir2_leaf_to_block( /* * Start converting it to block form. 
*/ - dbp->b_pre_io = xfs_dir2_block_write_verify; + dbp->b_ops = &xfs_dir2_block_buf_ops; hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); needlog = 1; needscan = 0; @@ -1140,7 +1142,7 @@ xfs_dir2_sf_to_block( kmem_free(sfp); return error; } - bp->b_pre_io = xfs_dir2_block_write_verify; + bp->b_ops = &xfs_dir2_block_buf_ops; hdr = bp->b_addr; hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); /* diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index dcb8a873ab92..ffcf1774152e 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -202,23 +202,57 @@ xfs_dir2_data_verify( } } -void -xfs_dir2_data_write_verify( +/* + * Readahead of the first block of the directory when it is opened is completely + * oblivious to the format of the directory. Hence we can either get a block + * format buffer or a data format buffer on readahead. + */ +static void +xfs_dir2_data_reada_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_data_hdr *hdr = bp->b_addr; + + switch (hdr->magic) { + case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): + bp->b_ops = &xfs_dir2_block_buf_ops; + bp->b_ops->verify_read(bp); + return; + case cpu_to_be32(XFS_DIR2_DATA_MAGIC): + xfs_dir2_data_verify(bp); + return; + default: + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + break; + } +} + +static void +xfs_dir2_data_read_verify( struct xfs_buf *bp) { xfs_dir2_data_verify(bp); } static void -xfs_dir2_data_read_verify( +xfs_dir2_data_write_verify( struct xfs_buf *bp) { xfs_dir2_data_verify(bp); - bp->b_pre_io = xfs_dir2_data_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +const struct xfs_buf_ops xfs_dir2_data_buf_ops = { + .verify_read = xfs_dir2_data_read_verify, + .verify_write = xfs_dir2_data_write_verify, +}; + +static const struct xfs_buf_ops xfs_dir2_data_reada_buf_ops = { + .verify_read = xfs_dir2_data_reada_verify, + .verify_write = xfs_dir2_data_write_verify, +}; + int xfs_dir2_data_read( @@ -229,7 +263,7 @@ xfs_dir2_data_read( struct xfs_buf **bpp) { return xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, - XFS_DATA_FORK, xfs_dir2_data_read_verify); + XFS_DATA_FORK, &xfs_dir2_data_buf_ops); } int @@ -240,7 +274,7 @@ xfs_dir2_data_readahead( xfs_daddr_t mapped_bno) { return xfs_da_reada_buf(tp, dp, bno, mapped_bno, - XFS_DATA_FORK, xfs_dir2_data_read_verify); + XFS_DATA_FORK, &xfs_dir2_data_reada_buf_ops); } /* @@ -484,7 +518,7 @@ xfs_dir2_data_init( XFS_DATA_FORK); if (error) return error; - bp->b_pre_io = xfs_dir2_data_write_verify; + bp->b_ops = &xfs_dir2_data_buf_ops; /* * Initialize the header. 
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 3002ab7d54c3..60cd2fa4e047 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -65,39 +65,43 @@ xfs_dir2_leaf_verify( } static void -xfs_dir2_leaf1_write_verify( +xfs_dir2_leaf1_read_verify( struct xfs_buf *bp) { xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); } static void -xfs_dir2_leaf1_read_verify( +xfs_dir2_leaf1_write_verify( struct xfs_buf *bp) { xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); - bp->b_pre_io = xfs_dir2_leaf1_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } void -xfs_dir2_leafn_write_verify( +xfs_dir2_leafn_read_verify( struct xfs_buf *bp) { xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); } void -xfs_dir2_leafn_read_verify( +xfs_dir2_leafn_write_verify( struct xfs_buf *bp) { xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - bp->b_pre_io = xfs_dir2_leafn_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +static const struct xfs_buf_ops xfs_dir2_leaf1_buf_ops = { + .verify_read = xfs_dir2_leaf1_read_verify, + .verify_write = xfs_dir2_leaf1_write_verify, +}; + +const struct xfs_buf_ops xfs_dir2_leafn_buf_ops = { + .verify_read = xfs_dir2_leafn_read_verify, + .verify_write = xfs_dir2_leafn_write_verify, +}; + static int xfs_dir2_leaf_read( struct xfs_trans *tp, @@ -107,7 +111,7 @@ xfs_dir2_leaf_read( struct xfs_buf **bpp) { return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, xfs_dir2_leaf1_read_verify); + XFS_DATA_FORK, &xfs_dir2_leaf1_buf_ops); } int @@ -119,7 +123,7 @@ xfs_dir2_leafn_read( struct xfs_buf **bpp) { return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, xfs_dir2_leafn_read_verify); + XFS_DATA_FORK, &xfs_dir2_leafn_buf_ops); } /* @@ -198,7 +202,7 @@ xfs_dir2_block_to_leaf( /* * Fix up the block header, make it a data block. */ - dbp->b_pre_io = xfs_dir2_data_write_verify; + dbp->b_ops = &xfs_dir2_data_buf_ops; hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); if (needscan) xfs_dir2_data_freescan(mp, hdr, &needlog); @@ -1264,12 +1268,12 @@ xfs_dir2_leaf_init( * the block. 
*/ if (magic == XFS_DIR2_LEAF1_MAGIC) { - bp->b_pre_io = xfs_dir2_leaf1_write_verify; + bp->b_ops = &xfs_dir2_leaf1_buf_ops; ltp = xfs_dir2_leaf_tail_p(mp, leaf); ltp->bestcount = 0; xfs_dir2_leaf_log_tail(tp, bp); } else - bp->b_pre_io = xfs_dir2_leafn_write_verify; + bp->b_ops = &xfs_dir2_leafn_buf_ops; *bpp = bp; return 0; } @@ -1954,7 +1958,7 @@ xfs_dir2_node_to_leaf( else xfs_dir2_leaf_log_header(tp, lbp); - lbp->b_pre_io = xfs_dir2_leaf1_write_verify; + lbp->b_ops = &xfs_dir2_leaf1_buf_ops; leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC); /* diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index da90a91f4420..5980f9b7fa9b 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -72,22 +72,24 @@ xfs_dir2_free_verify( } static void -xfs_dir2_free_write_verify( +xfs_dir2_free_read_verify( struct xfs_buf *bp) { xfs_dir2_free_verify(bp); } -void -xfs_dir2_free_read_verify( +static void +xfs_dir2_free_write_verify( struct xfs_buf *bp) { xfs_dir2_free_verify(bp); - bp->b_pre_io = xfs_dir2_free_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +static const struct xfs_buf_ops xfs_dir2_free_buf_ops = { + .verify_read = xfs_dir2_free_read_verify, + .verify_write = xfs_dir2_free_write_verify, +}; + static int __xfs_dir2_free_read( @@ -98,7 +100,7 @@ __xfs_dir2_free_read( struct xfs_buf **bpp) { return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, xfs_dir2_free_read_verify); + XFS_DATA_FORK, &xfs_dir2_free_buf_ops); } int @@ -201,7 +203,7 @@ xfs_dir2_leaf_to_node( XFS_DATA_FORK); if (error) return error; - fbp->b_pre_io = xfs_dir2_free_write_verify; + fbp->b_ops = &xfs_dir2_free_buf_ops; free = fbp->b_addr; leaf = lbp->b_addr; @@ -225,7 +227,7 @@ xfs_dir2_leaf_to_node( } free->hdr.nused = cpu_to_be32(n); - lbp->b_pre_io = xfs_dir2_leafn_write_verify; + lbp->b_ops = &xfs_dir2_leafn_buf_ops; leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC); /* @@ -636,7 +638,7 @@ xfs_dir2_leafn_lookup_for_entry( state->extrablk.index = (int)((char *)dep - (char *)curbp->b_addr); state->extrablk.magic = XFS_DIR2_DATA_MAGIC; - curbp->b_pre_io = xfs_dir2_data_write_verify; + curbp->b_ops = &xfs_dir2_data_buf_ops; if (cmp == XFS_CMP_EXACT) return XFS_ERROR(EEXIST); } @@ -651,7 +653,7 @@ xfs_dir2_leafn_lookup_for_entry( state->extrablk.index = -1; state->extrablk.blkno = curdb; state->extrablk.magic = XFS_DIR2_DATA_MAGIC; - curbp->b_pre_io = xfs_dir2_data_write_verify; + curbp->b_ops = &xfs_dir2_data_buf_ops; } else { /* If the curbp is not the CI match block, drop it */ if (state->extrablk.bp != curbp) @@ -1649,7 +1651,7 @@ xfs_dir2_node_addname_int( -1, &fbp, XFS_DATA_FORK); if (error) return error; - fbp->b_pre_io = xfs_dir2_free_write_verify; + fbp->b_ops = &xfs_dir2_free_buf_ops; /* * Initialize the new block to be empty, and remember diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 01b82dcddc3e..7da79f6515fd 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -30,6 +30,8 @@ extern int xfs_dir_cilookup_result(struct xfs_da_args *args, const unsigned char *name, int len); /* xfs_dir2_block.c */ +extern const struct xfs_buf_ops xfs_dir2_block_buf_ops; + extern int xfs_dir2_block_addname(struct xfs_da_args *args); extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent, xfs_off_t *offset, filldir_t filldir); @@ -45,7 +47,9 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, #else #define xfs_dir2_data_check(dp,bp) #endif -extern void xfs_dir2_data_write_verify(struct xfs_buf *bp); + +extern 
const struct xfs_buf_ops xfs_dir2_data_buf_ops; + extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); @@ -73,8 +77,8 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp, xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); /* xfs_dir2_leaf.c */ -extern void xfs_dir2_leafn_read_verify(struct xfs_buf *bp); -extern void xfs_dir2_leafn_write_verify(struct xfs_buf *bp); +extern const struct xfs_buf_ops xfs_dir2_leafn_buf_ops; + extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 1b06aa051074..9e1bf5294c91 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -284,22 +284,24 @@ xfs_dquot_buf_verify( } static void -xfs_dquot_buf_write_verify( +xfs_dquot_buf_read_verify( struct xfs_buf *bp) { xfs_dquot_buf_verify(bp); } void -xfs_dquot_buf_read_verify( +xfs_dquot_buf_write_verify( struct xfs_buf *bp) { xfs_dquot_buf_verify(bp); - bp->b_pre_io = xfs_dquot_buf_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +const struct xfs_buf_ops xfs_dquot_buf_ops = { + .verify_read = xfs_dquot_buf_read_verify, + .verify_write = xfs_dquot_buf_write_verify, +}; + /* * Allocate a block and fill it with dquots. * This is called when the bmapi finds a hole. @@ -365,7 +367,7 @@ xfs_qm_dqalloc( error = xfs_buf_geterror(bp); if (error) goto error1; - bp->b_pre_io = xfs_dquot_buf_write_verify; + bp->b_ops = &xfs_dquot_buf_ops; /* * Make a chunk of dquots out of this buffer and log @@ -435,7 +437,7 @@ xfs_qm_dqrepair( ASSERT(*bpp == NULL); return XFS_ERROR(error); } - (*bpp)->b_pre_io = xfs_dquot_buf_write_verify; + (*bpp)->b_ops = &xfs_dquot_buf_ops; ASSERT(xfs_buf_islocked(*bpp)); d = (struct xfs_dqblk *)(*bpp)->b_addr; @@ -534,7 +536,7 @@ xfs_qm_dqtobp( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, - 0, &bp, xfs_dquot_buf_read_verify); + 0, &bp, &xfs_dquot_buf_ops); if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) { xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff * diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 5438d883b628..c694a8469c4a 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -140,7 +140,6 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type) extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, uint, struct xfs_dquot **); -extern void xfs_dquot_buf_read_verify(struct xfs_buf *bp); extern void xfs_qm_dqdestroy(xfs_dquot_t *); extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **); extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); @@ -162,4 +161,6 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) return dqp; } +extern const struct xfs_buf_ops xfs_dquot_buf_ops; + #endif /* __XFS_DQUOT_H__ */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 5d6d6b9d369d..94eaeedc5498 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -119,7 +119,8 @@ xfs_growfs_get_hdr_buf( struct xfs_mount *mp, xfs_daddr_t blkno, size_t numblks, - int flags) + int flags, + const struct xfs_buf_ops *ops) { struct xfs_buf *bp; @@ -130,6 +131,7 @@ xfs_growfs_get_hdr_buf( xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); bp->b_bn = blkno; bp->b_maps[0].bm_bn = blkno; + 
bp->b_ops = ops; return bp; } @@ -217,12 +219,12 @@ xfs_growfs_data_private( */ bp = xfs_growfs_get_hdr_buf(mp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0); + XFS_FSS_TO_BB(mp, 1), 0, + &xfs_agf_buf_ops); if (!bp) { error = ENOMEM; goto error0; } - bp->b_pre_io = xfs_agf_write_verify; agf = XFS_BUF_TO_AGF(bp); agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); @@ -255,12 +257,12 @@ xfs_growfs_data_private( */ bp = xfs_growfs_get_hdr_buf(mp, XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0); + XFS_FSS_TO_BB(mp, 1), 0, + &xfs_agfl_buf_ops); if (!bp) { error = ENOMEM; goto error0; } - bp->b_pre_io = xfs_agfl_write_verify; agfl = XFS_BUF_TO_AGFL(bp); for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) @@ -276,12 +278,12 @@ xfs_growfs_data_private( */ bp = xfs_growfs_get_hdr_buf(mp, XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0); + XFS_FSS_TO_BB(mp, 1), 0, + &xfs_agi_buf_ops); if (!bp) { error = ENOMEM; goto error0; } - bp->b_pre_io = xfs_agi_write_verify; agi = XFS_BUF_TO_AGI(bp); agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); @@ -306,7 +308,8 @@ xfs_growfs_data_private( */ bp = xfs_growfs_get_hdr_buf(mp, XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_allocbt_buf_ops); if (!bp) { error = ENOMEM; @@ -329,7 +332,8 @@ xfs_growfs_data_private( */ bp = xfs_growfs_get_hdr_buf(mp, XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_allocbt_buf_ops); if (!bp) { error = ENOMEM; goto error0; @@ -352,7 +356,8 @@ xfs_growfs_data_private( */ bp = xfs_growfs_get_hdr_buf(mp, XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_inobt_buf_ops); if (!bp) { error = ENOMEM; goto error0; @@ -448,14 +453,14 @@ xfs_growfs_data_private( error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp, - xfs_sb_read_verify); + &xfs_sb_buf_ops); } else { bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), XFS_FSS_TO_BB(mp, 1), 0); if (bp) { + bp->b_ops = &xfs_sb_buf_ops; xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - bp->b_pre_io = xfs_sb_write_verify; } else error = ENOMEM; } diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index faf68600d3a6..2d6495eaaa34 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -210,7 +210,7 @@ xfs_ialloc_inode_init( * to log a whole cluster of inodes instead of all the * individual transactions causing a lot of log traffic. 
*/ - fbuf->b_pre_io = xfs_inode_buf_write_verify; + fbuf->b_ops = &xfs_inode_buf_ops; xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); for (i = 0; i < ninodes; i++) { int ioffset = i << mp->m_sb.sb_inodelog; @@ -1505,23 +1505,25 @@ xfs_agi_verify( xfs_check_agi_unlinked(agi); } -void -xfs_agi_write_verify( +static void +xfs_agi_read_verify( struct xfs_buf *bp) { xfs_agi_verify(bp); } static void -xfs_agi_read_verify( +xfs_agi_write_verify( struct xfs_buf *bp) { xfs_agi_verify(bp); - bp->b_pre_io = xfs_agi_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +const struct xfs_buf_ops xfs_agi_buf_ops = { + .verify_read = xfs_agi_read_verify, + .verify_write = xfs_agi_write_verify, +}; + /* * Read in the allocation group header (inode allocation section) */ @@ -1538,7 +1540,7 @@ xfs_read_agi( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, bpp, xfs_agi_read_verify); + XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops); if (error) return error; diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index 7a169e34e30e..c8da3df271e6 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -150,6 +150,6 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino, int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_inobt_rec_incore_t *rec, int *stat); -void xfs_agi_write_verify(struct xfs_buf *bp); +extern const struct xfs_buf_ops xfs_agi_buf_ops; #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index 7761e1ebeff7..bec344b36507 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -217,22 +217,24 @@ xfs_inobt_verify( } static void -xfs_inobt_write_verify( +xfs_inobt_read_verify( struct xfs_buf *bp) { xfs_inobt_verify(bp); } -void -xfs_inobt_read_verify( +static void +xfs_inobt_write_verify( struct xfs_buf *bp) { xfs_inobt_verify(bp); - bp->b_pre_io = xfs_inobt_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +const struct xfs_buf_ops xfs_inobt_buf_ops = { + .verify_read = xfs_inobt_read_verify, + .verify_write = xfs_inobt_write_verify, +}; + #ifdef DEBUG STATIC int xfs_inobt_keys_inorder( @@ -270,8 +272,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = { .init_rec_from_cur = xfs_inobt_init_rec_from_cur, .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, .key_diff = xfs_inobt_key_diff, - .read_verify = xfs_inobt_read_verify, - .write_verify = xfs_inobt_write_verify, + .buf_ops = &xfs_inobt_buf_ops, #ifdef DEBUG .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h index f782ad0c4769..25c0239a8eab 100644 --- a/fs/xfs/xfs_ialloc_btree.h +++ b/fs/xfs/xfs_ialloc_btree.h @@ -109,4 +109,6 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t); extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); +extern const struct xfs_buf_ops xfs_inobt_buf_ops; + #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index dfcbe73f1db4..66282dcb821b 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -420,23 +420,27 @@ xfs_inode_buf_verify( xfs_inobp_check(mp, bp); } -void -xfs_inode_buf_write_verify( + +static void +xfs_inode_buf_read_verify( struct xfs_buf *bp) { xfs_inode_buf_verify(bp); } -void -xfs_inode_buf_read_verify( +static void +xfs_inode_buf_write_verify( struct xfs_buf *bp) { xfs_inode_buf_verify(bp); - 
bp->b_pre_io = xfs_inode_buf_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +const struct xfs_buf_ops xfs_inode_buf_ops = { + .verify_read = xfs_inode_buf_read_verify, + .verify_write = xfs_inode_buf_write_verify, +}; + + /* * This routine is called to map an inode to the buffer containing the on-disk * version of the inode. It returns a pointer to the buffer containing the @@ -462,7 +466,7 @@ xfs_imap_to_bp( buf_flags |= XBF_UNMAPPED; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, (int)imap->im_len, buf_flags, &bp, - xfs_inode_buf_read_verify); + &xfs_inode_buf_ops); if (error) { if (error == EAGAIN) { ASSERT(buf_flags & XBF_TRYLOCK); @@ -1792,7 +1796,7 @@ xfs_ifree_cluster( * want it to fail. We can achieve this by adding a write * verifier to the buffer. */ - bp->b_pre_io = xfs_inode_buf_write_verify; + bp->b_ops = &xfs_inode_buf_ops; /* * Walk the inodes already attached to the buffer and mark them diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 482214d120a7..22baf6ea4fac 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -554,8 +554,6 @@ int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, struct xfs_buf **, uint, uint); int xfs_iread(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, uint); -void xfs_inode_buf_read_verify(struct xfs_buf *); -void xfs_inode_buf_write_verify(struct xfs_buf *); void xfs_dinode_to_disk(struct xfs_dinode *, struct xfs_icdinode *); void xfs_idestroy_fork(struct xfs_inode *, int); @@ -600,5 +598,6 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); extern struct kmem_zone *xfs_ifork_zone; extern struct kmem_zone *xfs_inode_zone; extern struct kmem_zone *xfs_ili_zone; +extern const struct xfs_buf_ops xfs_inode_buf_ops; #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 7f86fdaab7ae..2ea7d402188d 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -397,7 +397,7 @@ xfs_bulkstat( & ~r.ir_free) xfs_btree_reada_bufs(mp, agno, agbno, nbcluster, - xfs_inode_buf_read_verify); + &xfs_inode_buf_ops); } irbp->ir_startino = r.ir_startino; irbp->ir_freecount = r.ir_freecount; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 924a4bc3d49a..931e8e23f192 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3699,7 +3699,7 @@ xlog_do_recover( ASSERT(!(XFS_BUF_ISWRITE(bp))); XFS_BUF_READ(bp); XFS_BUF_UNASYNC(bp); - bp->b_iodone = xfs_sb_read_verify; + bp->b_ops = &xfs_sb_buf_ops; xfsbdstrat(log->l_mp, bp); error = xfs_buf_iowait(bp); if (error) { diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 152a7fc843f9..da508463ff10 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -631,21 +631,11 @@ xfs_sb_verify( xfs_buf_ioerror(bp, error); } -void -xfs_sb_write_verify( - struct xfs_buf *bp) -{ - xfs_sb_verify(bp); -} - -void +static void xfs_sb_read_verify( struct xfs_buf *bp) { xfs_sb_verify(bp); - bp->b_pre_io = xfs_sb_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } /* @@ -654,7 +644,7 @@ xfs_sb_read_verify( * If we find an XFS superblock, then run a normal, noisy mount because we are * really going to mount it and want to know about errors. 
*/ -void +static void xfs_sb_quiet_read_verify( struct xfs_buf *bp) { @@ -671,6 +661,23 @@ xfs_sb_quiet_read_verify( xfs_buf_ioerror(bp, EFSCORRUPTED); } +static void +xfs_sb_write_verify( + struct xfs_buf *bp) +{ + xfs_sb_verify(bp); +} + +const struct xfs_buf_ops xfs_sb_buf_ops = { + .verify_read = xfs_sb_read_verify, + .verify_write = xfs_sb_write_verify, +}; + +static const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { + .verify_read = xfs_sb_quiet_read_verify, + .verify_write = xfs_sb_write_verify, +}; + /* * xfs_readsb * @@ -697,8 +704,8 @@ xfs_readsb(xfs_mount_t *mp, int flags) reread: bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size), 0, - loud ? xfs_sb_read_verify - : xfs_sb_quiet_read_verify); + loud ? &xfs_sb_buf_ops + : &xfs_sb_quiet_buf_ops); if (!bp) { if (loud) xfs_warn(mp, "SB buffer read failed"); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 29c1b3ac920e..bab8314507e4 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -385,12 +385,12 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *); #endif /* __KERNEL__ */ -extern void xfs_sb_read_verify(struct xfs_buf *); -extern void xfs_sb_write_verify(struct xfs_buf *bp); extern void xfs_mod_sb(struct xfs_trans *, __int64_t); extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, xfs_agnumber_t *); extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); +extern const struct xfs_buf_ops xfs_sb_buf_ops; + #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index bd40ae9624e5..e6a0af0ba007 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -893,7 +893,7 @@ xfs_qm_dqiter_bufs( error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, bno), mp->m_quotainfo->qi_dqchunklen, 0, &bp, - xfs_dquot_buf_read_verify); + &xfs_dquot_buf_ops); if (error) break; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index f02d40296506..c6c0601abd7a 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -474,7 +474,7 @@ int xfs_trans_read_buf_map(struct xfs_mount *mp, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp, - xfs_buf_iodone_t verify); + const struct xfs_buf_ops *ops); static inline int xfs_trans_read_buf( @@ -485,11 +485,11 @@ xfs_trans_read_buf( int numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp, - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); return xfs_trans_read_buf_map(mp, tp, target, &map, 1, - flags, bpp, verify); + flags, bpp, ops); } struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 977628207b45..4fc17d479d42 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -258,7 +258,7 @@ xfs_trans_read_buf_map( int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp, - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { xfs_buf_t *bp; xfs_buf_log_item_t *bip; @@ -266,7 +266,7 @@ xfs_trans_read_buf_map( *bpp = NULL; if (!tp) { - bp = xfs_buf_read_map(target, map, nmaps, flags, verify); + bp = xfs_buf_read_map(target, map, nmaps, flags, ops); if (!bp) return (flags & XBF_TRYLOCK) ? 
EAGAIN : XFS_ERROR(ENOMEM); @@ -315,7 +315,7 @@ xfs_trans_read_buf_map( ASSERT(!XFS_BUF_ISASYNC(bp)); ASSERT(bp->b_iodone == NULL); XFS_BUF_READ(bp); - bp->b_iodone = verify; + bp->b_ops = ops; xfsbdstrat(tp->t_mountp, bp); error = xfs_buf_iowait(bp); if (error) { @@ -352,7 +352,7 @@ xfs_trans_read_buf_map( return 0; } - bp = xfs_buf_read_map(target, map, nmaps, flags, verify); + bp = xfs_buf_read_map(target, map, nmaps, flags, ops); if (bp == NULL) { *bpp = NULL; return (flags & XBF_TRYLOCK) ? -- cgit v1.2.1 From f3b59291a69d0b734be1fc8be489fef2dd846d3d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 15 Nov 2012 23:08:57 -0500 Subject: ext4: remove calls to ext4_jbd2_file_inode() from delalloc write path The calls to ext4_jbd2_file_inode() are needed to guarantee that we do not expose stale data in the data=ordered mode. However, they are not necessary because in all of the cases where we have newly allocated blocks in the delayed allocation write path, we immediately submit the dirty pages for I/O. Hence, we can avoid the overhead of adding the inode to the list of inodes whose data pages will need to be flushed out to disk completely during the next commit operation. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 52f7ff2f2e7e..cf5d30a7cce3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1636,15 +1636,6 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) for (i = 0; i < map.m_len; i++) unmap_underlying_metadata(bdev, map.m_pblk + i); - - if (ext4_should_order_data(mpd->inode)) { - err = ext4_jbd2_file_inode(handle, mpd->inode); - if (err) { - /* Only if the journal is aborted */ - mpd->retval = err; - goto submit_io; - } - } } /* @@ -2592,17 +2583,8 @@ static int ext4_da_write_end(struct file *file, if (copied && new_i_size > EXT4_I(inode)->i_disksize) { if (ext4_da_should_update_i_disksize(page, end)) { down_write(&EXT4_I(inode)->i_data_sem); - if (new_i_size > EXT4_I(inode)->i_disksize) { - /* - * Updating i_disksize when extending file - * without needing block allocation - */ - if (ext4_should_order_data(inode)) - ret = ext4_jbd2_file_inode(handle, - inode); - + if (new_i_size > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = new_i_size; - } up_write(&EXT4_I(inode)->i_data_sem); /* We need to mark inode dirty even if * new_i_size is less that inode->i_size -- cgit v1.2.1 From b7804161a3a3077c568078dfaa4ee4ffc8817f65 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Fri, 16 Nov 2012 09:04:16 -0500 Subject: GFS2: don't reference inode's glock during block allocation trace This patch changes the block allocation trace so that it references the rgd's glock rather than the inode's glock. Now that the order of inode creation is switched, this prevents a reference to the glock which may not be set yet. 
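Taking stock of the xfs_buf_ops conversion that dominates the diffs above: every converted buffer type follows one shape - a common structure-checking helper, thin read/write wrappers around it, and a single const ops table that replaces the old b_pre_io/b_iodone juggling. A minimal sketch of that shape for a hypothetical buffer type follows; every "foo" name is invented for illustration, while struct xfs_buf_ops, b_ops, xfs_buf_ioerror() and EFSCORRUPTED are from the patches themselves.

/*
 * Sketch only: how a buffer type plugs into the const xfs_buf_ops table
 * introduced above. All "foo" identifiers are hypothetical.
 */
static void
xfs_foo_verify(
	struct xfs_buf		*bp)
{
	struct xfs_foo_hdr	*hdr = bp->b_addr;	/* made-up on-disk header */

	/* XFS_FOO_MAGIC is a stand-in for the real magic number check */
	if (hdr->magic != cpu_to_be32(XFS_FOO_MAGIC))
		xfs_buf_ioerror(bp, EFSCORRUPTED);
}

static void
xfs_foo_read_verify(
	struct xfs_buf		*bp)
{
	xfs_foo_verify(bp);
}

static void
xfs_foo_write_verify(
	struct xfs_buf		*bp)
{
	xfs_foo_verify(bp);
}

/* one const table pairs the two callbacks for the buffer's lifetime */
const struct xfs_buf_ops xfs_foo_buf_ops = {
	.verify_read	= xfs_foo_read_verify,
	.verify_write	= xfs_foo_write_verify,
};

Readers would then pass &xfs_foo_buf_ops to xfs_da_read_buf() or xfs_trans_read_buf(), and code that initialises a buffer by hand would set bp->b_ops = &xfs_foo_buf_ops, exactly as the conversions above do for the dir2, dquot, AGF/AGFL/AGI, inode and superblock buffers.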
Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/trace_gfs2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index bbdc78af60ca..2ee13e841e9f 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -486,7 +486,7 @@ TRACE_EVENT(gfs2_block_alloc, ), TP_fast_assign( - __entry->dev = ip->i_gl->gl_sbd->sd_vfs->s_dev; + __entry->dev = rgd->rd_gl->gl_sbd->sd_vfs->s_dev; __entry->start = block; __entry->inum = ip->i_no_addr; __entry->len = len; -- cgit v1.2.1 From be4f245dbbbc1f37370ab463cd4892acf4a1222b Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Fri, 16 Nov 2012 09:11:39 -0500 Subject: GFS2: add error check while allocating new inodes This patch adds a return code check after attempting to allocate a new inode during dinode creation. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index e321333f0b4c..2405695febe9 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -674,6 +674,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, goto fail_gunlock; inode = new_inode(sdp->sd_vfs); + if (!inode) { + gfs2_glock_dq_uninit(ghs); + return -ENOMEM; + } ip = GFS2_I(inode); error = gfs2_rs_alloc(ip); if (error) -- cgit v1.2.1 From da8c66638ae684c99abcb30e89d2803402e7ca20 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Thu, 15 Nov 2012 15:01:51 -0600 Subject: dlm: fix lvb invalidation conditions When a node is removed that held a PW/EX lock, the existing master node should invalidate the lvb on the resource due to the purged lock. Previously, the existing master node was invalidating the lvb if it found only NL/CR locks on the resource during recovery for the removed node. This could lead to cases where it invalidated the lvb and shouldn't have, or cases where it should have invalidated and didn't. When recovery selects a *new* master node for a resource, and that new master finds only NL/CR locks on the resource after lock recovery, it should invalidate the lvb. This case was handled correctly (but was incorrectly applied to the existing master case also.) When a process exits while holding a PW/EX lock, the lvb on the resource should be invalidated. This was not happening. The lvb contents and VALNOTVALID flag should be recovered before granting locks in recovery so that the recovered lvb state is provided in the callback. The lvb was being recovered after the lock was granted. 
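In code form, the decision recover_lvb() now has to make looks roughly like the sketch below. The flag and helper names (rsb_flag(), rsb_set_flag(), RSB_NEW_MASTER2, RSB_RECOVER_LVB_INVAL, RSB_VALNOTVALID) are from the patch; the function body is a simplification that elides the queue scan and the lvb copy, not the real implementation.

/* Simplified sketch of the two invalidation cases described above. */
static void recover_lvb_sketch(struct dlm_rsb *r)
{
	/*
	 * case 1: we stayed master, but dlm_recover_purge() dropped a
	 * PW/EX lock held by a failed node - the lvb may now be stale
	 */
	if (!rsb_flag(r, RSB_NEW_MASTER2) &&
	    rsb_flag(r, RSB_RECOVER_LVB_INVAL)) {
		rsb_set_flag(r, RSB_VALNOTVALID);
		return;
	}

	/* not a new master and nothing was purged: the lvb is still good */
	if (!rsb_flag(r, RSB_NEW_MASTER2))
		return;

	/*
	 * case 2: we are the new master - scan the grant/convert queues;
	 * if no lock >= PW carries a valid lvb, set VALNOTVALID, otherwise
	 * copy the best lkb's lvb into r->res_lvbptr (elided here - see
	 * recover_lvb() in the fs/dlm/recover.c hunk below)
	 */
}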
Signed-off-by: David Teigland --- fs/dlm/dlm_internal.h | 1 + fs/dlm/lock.c | 16 +++++++++++++--- fs/dlm/recover.c | 37 ++++++++++++++++++++++++++++++------- 3 files changed, 44 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 871c1abf6029..77c0f70f8fe8 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -337,6 +337,7 @@ enum rsb_flags { RSB_NEW_MASTER2, RSB_RECOVER_CONVERT, RSB_RECOVER_GRANT, + RSB_RECOVER_LVB_INVAL, }; static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag) diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index b56950758188..a579f30f237d 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -5393,6 +5393,13 @@ static void purge_dead_list(struct dlm_ls *ls, struct dlm_rsb *r, if ((lkb->lkb_nodeid == nodeid_gone) || dlm_is_removed(ls, lkb->lkb_nodeid)) { + /* tell recover_lvb to invalidate the lvb + because a node holding EX/PW failed */ + if ((lkb->lkb_exflags & DLM_LKF_VALBLK) && + (lkb->lkb_grmode >= DLM_LOCK_PW)) { + rsb_set_flag(r, RSB_RECOVER_LVB_INVAL); + } + del_lkb(r, lkb); /* this put should free the lkb */ @@ -6025,15 +6032,18 @@ static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) return error; } -/* The force flag allows the unlock to go ahead even if the lkb isn't granted. - Regardless of what rsb queue the lock is on, it's removed and freed. */ +/* The FORCEUNLOCK flag allows the unlock to go ahead even if the lkb isn't + granted. Regardless of what rsb queue the lock is on, it's removed and + freed. The IVVALBLK flag causes the lvb on the resource to be invalidated + if our lock is PW/EX (it's ignored if our granted mode is smaller.) */ static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) { struct dlm_args args; int error; - set_unlock_args(DLM_LKF_FORCEUNLOCK, lkb->lkb_ua, &args); + set_unlock_args(DLM_LKF_FORCEUNLOCK | DLM_LKF_IVVALBLK, + lkb->lkb_ua, &args); error = unlock_lock(ls, lkb, &args); if (error == -DLM_EUNLOCK) diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index 4a7a76e42fc3..aedea28a86a1 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -717,8 +717,14 @@ void dlm_recovered_lock(struct dlm_rsb *r) * the VALNOTVALID flag if necessary, and determining the correct lvb contents * based on the lvb's of the locks held on the rsb. * - * RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb. If it - * was already set prior to recovery, it's not cleared, regardless of locks. + * RSB_VALNOTVALID is set in two cases: + * + * 1. we are master, but not new, and we purged an EX/PW lock held by a + * failed node (in dlm_recover_purge which set RSB_RECOVER_LVB_INVAL) + * + * 2. we are a new master, and there are only NL/CR locks left. + * (We could probably improve this by only invalidating in this way when + * the previous master left uncleanly. VMS docs mention that.) * * The LVB contents are only considered for changing when this is a new master * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with @@ -734,6 +740,19 @@ static void recover_lvb(struct dlm_rsb *r) int big_lock_exists = 0; int lvblen = r->res_ls->ls_lvblen; + if (!rsb_flag(r, RSB_NEW_MASTER2) && + rsb_flag(r, RSB_RECOVER_LVB_INVAL)) { + /* case 1 above */ + rsb_set_flag(r, RSB_VALNOTVALID); + return; + } + + if (!rsb_flag(r, RSB_NEW_MASTER2)) + return; + + /* we are the new master, so figure out if VALNOTVALID should + be set, and set the rsb lvb from the best lkb available. 
*/ + list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) continue; @@ -772,13 +791,10 @@ static void recover_lvb(struct dlm_rsb *r) if (!lock_lvb_exists) goto out; + /* lvb is invalidated if only NL/CR locks remain */ if (!big_lock_exists) rsb_set_flag(r, RSB_VALNOTVALID); - /* don't mess with the lvb unless we're the new master */ - if (!rsb_flag(r, RSB_NEW_MASTER2)) - goto out; - if (!r->res_lvbptr) { r->res_lvbptr = dlm_allocate_lvb(r->res_ls); if (!r->res_lvbptr) @@ -852,12 +868,19 @@ void dlm_recover_rsbs(struct dlm_ls *ls) if (is_master(r)) { if (rsb_flag(r, RSB_RECOVER_CONVERT)) recover_conversion(r); + + /* recover lvb before granting locks so the updated + lvb/VALNOTVALID is presented in the completion */ + recover_lvb(r); + if (rsb_flag(r, RSB_NEW_MASTER2)) recover_grant(r); - recover_lvb(r); count++; + } else { + rsb_clear_flag(r, RSB_VALNOTVALID); } rsb_clear_flag(r, RSB_RECOVER_CONVERT); + rsb_clear_flag(r, RSB_RECOVER_LVB_INVAL); rsb_clear_flag(r, RSB_NEW_MASTER2); unlock_rsb(r); } -- cgit v1.2.1 From fa0cbbf145aabbf29c6f28f8a11935c0b0fd86fc Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 12 Nov 2012 17:53:04 -0800 Subject: mm, oom: reintroduce /proc/pid/oom_adj This is mostly a revert of 01dc52ebdf47 ("oom: remove deprecated oom_adj") from Davidlohr Bueso. It reintroduces /proc/pid/oom_adj for backwards compatibility with earlier kernels. It simply scales the value linearly when /proc/pid/oom_score_adj is written. The major difference is that its scheduled removal is no longer included in Documentation/feature-removal-schedule.txt. We do warn users with a single printk, though, to suggest the more powerful and supported /proc/pid/oom_score_adj interface. Reported-by: Artem S. 
Tashkinov Signed-off-by: David Rientjes Signed-off-by: Linus Torvalds --- fs/proc/base.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 144a96732dd7..3c231adf8450 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -873,6 +873,113 @@ static const struct file_operations proc_environ_operations = { .release = mem_release, }; +static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + char buffer[PROC_NUMBUF]; + int oom_adj = OOM_ADJUST_MIN; + size_t len; + unsigned long flags; + + if (!task) + return -ESRCH; + if (lock_task_sighand(task, &flags)) { + if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX) + oom_adj = OOM_ADJUST_MAX; + else + oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) / + OOM_SCORE_ADJ_MAX; + unlock_task_sighand(task, &flags); + } + put_task_struct(task); + len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj); + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static ssize_t oom_adj_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + char buffer[PROC_NUMBUF]; + int oom_adj; + unsigned long flags; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &oom_adj); + if (err) + goto out; + if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) && + oom_adj != OOM_DISABLE) { + err = -EINVAL; + goto out; + } + + task = get_proc_task(file->f_path.dentry->d_inode); + if (!task) { + err = -ESRCH; + goto out; + } + + task_lock(task); + if (!task->mm) { + err = -EINVAL; + goto err_task_lock; + } + + if (!lock_task_sighand(task, &flags)) { + err = -ESRCH; + goto err_task_lock; + } + + /* + * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum + * value is always attainable. + */ + if (oom_adj == OOM_ADJUST_MAX) + oom_adj = OOM_SCORE_ADJ_MAX; + else + oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; + + if (oom_adj < task->signal->oom_score_adj && + !capable(CAP_SYS_RESOURCE)) { + err = -EACCES; + goto err_sighand; + } + + /* + * /proc/pid/oom_adj is provided for legacy purposes, ask users to use + * /proc/pid/oom_score_adj instead. + */ + printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", + current->comm, task_pid_nr(current), task_pid_nr(task), + task_pid_nr(task)); + + task->signal->oom_score_adj = oom_adj; + trace_oom_score_adj_update(task); +err_sighand: + unlock_task_sighand(task, &flags); +err_task_lock: + task_unlock(task); + put_task_struct(task); +out: + return err < 0 ? 
err : count; +} + +static const struct file_operations proc_oom_adj_operations = { + .read = oom_adj_read, + .write = oom_adj_write, + .llseek = generic_file_llseek, +}; + static ssize_t oom_score_adj_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -2598,6 +2705,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif INF("oom_score", S_IRUGO, proc_oom_score), + REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), @@ -2964,6 +3072,7 @@ static const struct pid_entry tid_base_stuff[] = { REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif INF("oom_score", S_IRUGO, proc_oom_score), + REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), -- cgit v1.2.1 From 53f21a8ea1d76a002103ce20abd168fe83b20ee7 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 17 Oct 2012 09:39:49 +0200 Subject: pstore/ram: Fixup section annotations The compiler complained about missing section annotations. Fix it. Signed-off-by: Hannes Reinecke Cc: Colin Cross Cc: Tony Luck Acked-by: Kees Cook Signed-off-by: Anton Vorontsov --- fs/pstore/ram.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 1a4f6da58eab..2b6ebbca3521 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -287,8 +287,9 @@ static void ramoops_free_przs(struct ramoops_context *cxt) kfree(cxt->przs); } -static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, - phys_addr_t *paddr, size_t dump_mem_sz) +static int __devinit ramoops_init_przs(struct device *dev, + struct ramoops_context *cxt, + phys_addr_t *paddr, size_t dump_mem_sz) { int err = -ENOMEM; int i; @@ -326,9 +327,10 @@ fail_prz: return err; } -static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt, - struct persistent_ram_zone **prz, - phys_addr_t *paddr, size_t sz, u32 sig) +static int __devinit ramoops_init_prz(struct device *dev, + struct ramoops_context *cxt, + struct persistent_ram_zone **prz, + phys_addr_t *paddr, size_t sz, u32 sig) { if (!sz) return 0; -- cgit v1.2.1 From 42e2976f131d65555d5c1d6c3d47facc63577814 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:09:44 +1100 Subject: xfs: fix attr tree double split corruption In certain circumstances, a double split of an attribute tree is needed to insert or replace an attribute. In rare situations, this can go wrong, leaving the attribute tree corrupted. In this case, the attr being replaced is the last attr in a leaf node, and the replacement is larger so doesn't fit in the same leaf node. We start with the initial condition of a node format attribute btree with two leaves at index 1 and 2. Call them L1 and L2. The leaf L1 is completely full, there is not a single byte of free space in it. L2 is mostly empty. The attribute being replaced - call it X - is the last attribute in L1. The way an attribute replace is executed is that the replacement attribute - call it Y - is first inserted into the tree, but has an INCOMPLETE flag set on it so that list traversals ignore it. 
Once this transaction is committed, a second transaction is run to atomically mark Y as COMPLETE and X as INCOMPLETE, so that a traversal will now find Y and skip X. Once that transaction is committed, attribute X is then removed. So, the initial condition is:

+--------+     +--------+
|   L1   |     |   L2   |
| fwd: 2 |---->| fwd: 0 |
| bwd: 0 |<----| bwd: 1 |
| fsp: 0 |     | fsp: N |
|--------|     |--------|
| attr A |     | attr 1 |
|--------|     |--------|
| attr B |     | attr 2 |
|--------|     |--------|
..........     ..........
|--------|     |--------|
| attr X |     | attr n |
+--------+     +--------+

So now we go to replace X, and see that L1:fsp = 0 - it is full so we can't insert Y in the same leaf. So we record the location of attribute X so we can track it for later use, then we split L1 into L1 and L3 and rebalance across the two leaves. We end with:

+--------+     +--------+     +--------+
|   L1   |     |   L3   |     |   L2   |
| fwd: 3 |---->| fwd: 2 |---->| fwd: 0 |
| bwd: 0 |<----| bwd: 1 |<----| bwd: 3 |
| fsp: M |     | fsp: J |     | fsp: N |
|--------|     |--------|     |--------|
| attr A |     | attr X |     | attr 1 |
|--------|     +--------+     |--------|
| attr B |                    | attr 2 |
|--------|                    |--------|
..........                    ..........
|--------|                    |--------|
| attr W |                    | attr n |
+--------+                    +--------+

And we track that the original attribute is now at L3:0. We then try to insert Y into L1 again, and find that there isn't enough room because the new attribute is larger than the old one. Hence we have to split again to make room for Y. We end up with this:

+--------+     +--------+     +--------+     +--------+
|   L1   |     |   L4   |     |   L3   |     |   L2   |
| fwd: 4 |---->| fwd: 3 |---->| fwd: 2 |---->| fwd: 0 |
| bwd: 0 |<----| bwd: 1 |<----| bwd: 4 |<----| bwd: 3 |
| fsp: M |     | fsp: J |     | fsp: J |     | fsp: N |
|--------|     |--------|     |--------|     |--------|
| attr A |     | attr Y |     | attr X |     | attr 1 |
|--------|     + INCOMP +     +--------+     |--------|
| attr B |     +--------+                    | attr 2 |
|--------|                                   |--------|
..........                                   ..........
|--------|                                   |--------|
| attr W |                                   | attr n |
+--------+                                   +--------+

And now we have the new (incomplete) attribute @ L4:0, and the original attribute at L3:0. At this point, the first transaction is committed, and we move to the flipping of the flags. This is where we are supposed to end up with this:

+--------+     +--------+     +--------+     +--------+
|   L1   |     |   L4   |     |   L3   |     |   L2   |
| fwd: 4 |---->| fwd: 3 |---->| fwd: 2 |---->| fwd: 0 |
| bwd: 0 |<----| bwd: 1 |<----| bwd: 4 |<----| bwd: 3 |
| fsp: M |     | fsp: J |     | fsp: J |     | fsp: N |
|--------|     |--------|     |--------|     |--------|
| attr A |     | attr Y |     | attr X |     | attr 1 |
|--------|     +--------+     + INCOMP +     |--------|
| attr B |                    +--------+     | attr 2 |
|--------|                                   |--------|
..........                                   ..........
|--------|                                   |--------|
| attr W |                                   | attr n |
+--------+                                   +--------+

But that doesn't happen properly - the attribute tracking indexes are not pointing to the right locations. What we end up with is both the old attribute to be removed pointing at L4:0 and the new attribute at L4:1. On a debug kernel, this assert fails like so: XFS: Assertion failed: args->index2 < be16_to_cpu(leaf2->hdr.count), file: fs/xfs/xfs_attr_leaf.c, line: 2725 because the new attribute location does not exist. On a production kernel, this goes unnoticed and the code proceeds ahead merrily and removes L4 because it thinks that is the block that is no longer needed. This leaves the hash index node pointing to entries L1, L4 and L2, but only blocks L1, L3 and L2 exist. Further, the leaf level sibling list is L1 <-> L4 <-> L2, but L4 is now free space, and so everything is busted. 
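Before dissecting the cause, the replace protocol can be recapped in a compact sketch. Every identifier below is a stand-in invented for illustration - the real work is spread across xfs_attr_leaf.c and the attr state machine - and only the INCOMPLETE-flag sequencing mirrors the description above.

/* Hedged sketch of the three-transaction attr replace described above. */
struct sketch_attr { unsigned int flags; };
#define SKETCH_INCOMPLETE	0x1	/* stands in for XFS_ATTR_INCOMPLETE */

static void sketch_commit(void) { }	/* transaction boundary */

static void sketch_attr_replace(struct sketch_attr *x, struct sketch_attr *y)
{
	/* txn 1: insert replacement Y, hidden from list traversals */
	y->flags |= SKETCH_INCOMPLETE;
	sketch_commit();

	/* txn 2: atomic flag flip - traversals now find Y and skip X */
	y->flags &= ~SKETCH_INCOMPLETE;
	x->flags |= SKETCH_INCOMPLETE;
	sketch_commit();

	/*
	 * txn 3: remove X; the join triggered by this removal is what
	 * frees the wrong leaf block once the tracked old/new indexes
	 * (args->blkno/index vs args->blkno2/index2) have gone stale
	 */
	sketch_commit();
}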
This corruption is caused by the removal of the old attribute triggering a join - it joins everything correctly but then frees the wrong block. xfs_repair will report something like:

bad sibling back pointer for block 4 in attribute fork for inode 131
problem with attribute contents in inode 131
would clear attr fork
bad nblocks 8 for inode 131, would reset to 3
bad anextents 4 for inode 131, would reset to 0

The problem lies in the assignment of the old/new blocks for tracking purposes when the double leaf split occurs. The first split tries to place the new attribute inside the current leaf (i.e. "inleaf == true") and moves the old attribute (X) to the new block. This sets up the old block/index to L1:X, and the newly allocated block to L3:0. It then moves attr X to the new block and tries to insert attr Y at the old index. That fails, so it splits again. With the second split, the rebalance ends up placing the new attr in the second new block - L4:0 - and this is where the code goes wrong. What it does is set both the new and old block index to the second new block. Hence it inserts attr Y at the right place (L4:0) but overwrites the current location of the attr to replace that is held in the new block index (currently L3:0). It overwrites it with L4:1 - the index we later assert-fail on. Hopefully this table will show this in a format that is a bit easier to understand:

Split        old attr index       new attr index
             vanilla   patched    vanilla   patched
before 1st   L1:26     L1:26      N/A       N/A
after 1st    L3:0      L3:0       L1:26     L1:26
after 2nd    L4:0      L3:0       L4:1      L4:0
             ^^^^                 ^^^^
             wrong                wrong

The fix is surprisingly simple, for all this analysis - just stop the rebalance on the out-of-leaf case from overwriting the new attr index - it's already correct for the double split case. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_attr_leaf.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index d330111ca738..70eec1829776 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -1291,6 +1291,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, leaf2 = blk2->bp->b_addr; ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + ASSERT(leaf2->hdr.count == 0); args = state->args; trace_xfs_attr_leaf_rebalance(args); @@ -1361,6 +1362,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * I assert that since all callers pass in an empty * second buffer, this code should never execute. */ + ASSERT(0); /* * Figure the total bytes to be added to the destination leaf. @@ -1422,10 +1424,24 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, args->index2 = 0; args->blkno2 = blk2->blkno; } else { + /* + * On a double leaf split, the original attr location + * is already stored in blkno2/index2, so don't + * overwrite it otherwise we corrupt the tree. + */ blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count); - args->index = args->index2 = blk2->index; - args->blkno = args->blkno2 = blk2->blkno; + args->index = blk2->index; + args->blkno = blk2->blkno; + if (!state->extravalid) { + /* + * set the new attr location to match the old + * one and let the higher level split code + * decide where in the leaf to place it. 
+ */ + args->index2 = blk2->index; + args->blkno2 = blk2->blkno; + } } } else { ASSERT(state->inleaf == 1); -- cgit v1.2.1 From 3daed8bc3e49b9695ae931b9f472b5b90d1965b3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:09:45 +1100 Subject: xfs: fix broken error handling in xfs_vm_writepage When we shut down the filesystem, it might first be detected in writeback when we are allocating an inode size transaction. This happens after we have moved all the pages into the writeback state and unlocked them. Unfortunately, if we fail to set up the transaction we then abort writeback and try to invalidate the current page. This then triggers a BUG() in block_invalidatepage() because we are trying to invalidate an unlocked page. Fixing this is a bit of a chicken and egg problem - we can't allocate the transaction until we've clustered all the pages into the IO and we know the size of it (i.e. whether the last block of the IO is beyond the current EOF or not). However, we don't want to hold pages locked for long periods of time, especially while we lock other pages to cluster them into the write. To fix this, we need to make a clear delineation in writeback where errors can only be handled by IO completion processing. That is, once we have marked a page for writeback and unlocked it, we have to report errors via IO completion because we've already started the IO. We may not have submitted any IO, but we've changed the page state to indicate that it is under IO so we must now use the IO completion path to report errors. To do this, add an error field to xfs_submit_ioend() to pass it the error that occurred during the building of the ioend chain. When this is non-zero, mark each ioend with the error and call xfs_finish_ioend() directly rather than building bios. This will immediately push the ioends through completion processing with the error that has occurred. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_aops.c | 54 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e562dd43f41f..e57e2daa357c 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -481,11 +481,17 @@ static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh) * * The fix is two passes across the ioend list - one to start writeback on the * buffer_heads, and then submit them for I/O on the second pass. + * + * If @fail is non-zero, it means that we have a situation where some part of + * the submission process has failed after we have marked pages for writeback + * and unlocked them. In this situation, we need to fail the ioend chain rather + * than submit it to IO. This typically only happens on a filesystem shutdown. */ STATIC void xfs_submit_ioend( struct writeback_control *wbc, - xfs_ioend_t *ioend) + xfs_ioend_t *ioend, + int fail) { xfs_ioend_t *head = ioend; xfs_ioend_t *next; @@ -506,6 +512,18 @@ xfs_submit_ioend( next = ioend->io_list; bio = NULL; + /* + * If we are failing the IO now, just mark the ioend with an + * error and finish it. This will run IO completion immediately + * as there is only one reference to the ioend at this point in + * time. 
+ */ if (fail) { + ioend->io_error = -fail; + xfs_finish_ioend(ioend); + continue; + } + for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { if (!bio) { @@ -1060,7 +1078,18 @@ xfs_vm_writepage( xfs_start_page_writeback(page, 1, count); - if (ioend && imap_valid) { + /* if there is no IO to be submitted for this page, we are done */ + if (!ioend) + return 0; + + ASSERT(iohead); + + /* + * Any errors from this point onwards need to be reported through the IO + * completion path as we have marked the initial page as under writeback + * and unlocked it. + */ + if (imap_valid) { xfs_off_t end_index; end_index = imap.br_startoff + imap.br_blockcount; @@ -1079,20 +1108,15 @@ xfs_vm_writepage( wbc, end_index); } - if (iohead) { - /* - * Reserve log space if we might write beyond the on-disk - * inode size. - */ - if (ioend->io_type != XFS_IO_UNWRITTEN && - xfs_ioend_is_append(ioend)) { - err = xfs_setfilesize_trans_alloc(ioend); - if (err) - goto error; - } - xfs_submit_ioend(wbc, iohead); - } + /* + * Reserve log space if we might write beyond the on-disk inode size. + */ + err = 0; + if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend)) + err = xfs_setfilesize_trans_alloc(ioend); + + xfs_submit_ioend(wbc, iohead, err); return 0; -- cgit v1.2.1 From d69043c42d8c6414fa28ad18d99973aa6c1c2e24 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:09:46 +1100 Subject: xfs: drop buffer io reference when a bad bio is built Error handling in xfs_buf_ioapply_map() does not handle IO reference counts correctly. We increment the b_io_remaining count before building the bio, but then fail to decrement it in the failure case. This leads to the buffer never running IO completion and releasing the reference that the IO holds, so at unmount we can leak the buffer. This leak is captured by this assert failure during unmount: XFS: Assertion failed: atomic_read(&pag->pag_ref) == 0, file: fs/xfs/xfs_mount.c, line: 273 This is not a new bug - the b_io_remaining accounting has had this problem for a long, long time - it's just very hard to get a zero length bio being built by this code... Further, the buffer IO error can be overwritten on a multi-segment buffer by subsequent bio completions for partial sections of the buffer. Hence we should only set the buffer error status if the buffer is not already carrying an error status. This ensures that a partial IO error on a multi-segment buffer will not be lost. This part of the problem is a regression, however. cc: Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 933b7930b863..4b0b8dd1b7b0 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1197,9 +1197,14 @@ xfs_buf_bio_end_io( { xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; - xfs_buf_ioerror(bp, -error); + /* + * don't overwrite existing errors - otherwise we can lose errors on + * buffers that require multiple bios to complete. 
+ */ if (!bp->b_error) + xfs_buf_ioerror(bp, -error); - if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) + if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); _xfs_buf_ioend(bp, 1); @@ -1279,6 +1284,11 @@ next_chunk: if (size) goto next_chunk; } else { + /* + * This is guaranteed not to be the last io reference count + * because the caller (xfs_buf_iorequest) holds a count itself. + */ + atomic_dec(&bp->b_io_remaining); xfs_buf_ioerror(bp, EIO); bio_put(bio); } -- cgit v1.2.1 From b042e47491ba5f487601b5141a3f1d8582304170 Mon Sep 17 00:00:00 2001 From: Maxime Bizon Date: Mon, 22 Oct 2012 11:19:28 +0200 Subject: pstore/ram: Fix undefined usage of rounddown_pow_of_two(0) record_size / console_size / ftrace_size can be 0 (this is how you disable the feature), but rounddown_pow_of_two(0) is undefined. As suggested by Kees Cook, use !is_power_of_2() as a condition to call rounddown_pow_of_two and avoid its undefined behavior on the value 0. This issue has been present since commit 1894a253 (ramoops: Move to fs/pstore/ram.c). Cc: stable@vger.kernel.org Signed-off-by: Maxime Bizon Signed-off-by: Florian Fainelli Acked-by: Kees Cook Signed-off-by: Anton Vorontsov --- fs/pstore/ram.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 2b6ebbca3521..8741cea6253c 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -376,10 +376,14 @@ static int __devinit ramoops_probe(struct platform_device *pdev) goto fail_out; } - pdata->mem_size = rounddown_pow_of_two(pdata->mem_size); - pdata->record_size = rounddown_pow_of_two(pdata->record_size); - pdata->console_size = rounddown_pow_of_two(pdata->console_size); - pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size); + if (!is_power_of_2(pdata->mem_size)) + pdata->mem_size = rounddown_pow_of_two(pdata->mem_size); + if (!is_power_of_2(pdata->record_size)) + pdata->record_size = rounddown_pow_of_two(pdata->record_size); + if (!is_power_of_2(pdata->console_size)) + pdata->console_size = rounddown_pow_of_two(pdata->console_size); + if (!is_power_of_2(pdata->ftrace_size)) + pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size); cxt->dump_read_cnt = 0; cxt->size = pdata->mem_size; -- cgit v1.2.1 From 2cbba75a56ea78e6876b4e2547a882f10b3fe72b Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Mon, 5 Nov 2012 22:40:14 +0400 Subject: jffs2: hold erase_completion_lock on exit Users of jffs2_do_reserve_space() expect to still hold erase_completion_lock after calling it. But there is a path where jffs2_do_reserve_space() leaves erase_completion_lock unlocked. The patch fixes it. Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Alexey Khoroshilov Cc: stable@vger.kernel.org Signed-off-by: Artem Bityutskiy --- fs/jffs2/nodemgmt.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c index 0c96eb52c797..03310721712f 100644 --- a/fs/jffs2/nodemgmt.c +++ b/fs/jffs2/nodemgmt.c @@ -417,14 +417,16 @@ static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, spin_unlock(&c->erase_completion_lock); ret = jffs2_prealloc_raw_node_refs(c, jeb, 1); - if (ret) - return ret; + /* Just lock it again and continue. Nothing much can change because we hold c->alloc_sem anyway. 
In fact, it's not entirely clear why we hold c->erase_completion_lock in the majority of this function... but that's a question for another (more caffeine-rich) day. */ spin_lock(&c->erase_completion_lock); + if (ret) + return ret; + waste = jeb->free_size; jffs2_link_node_ref(c, jeb, (jeb->offset + c->sector_size - waste) | REF_OBSOLETE, -- cgit v1.2.1 From 3587b1b097d70c2eb9fee95ea7995d13c05f66e5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 18 Nov 2012 19:19:00 +0000 Subject: fanotify: fix FAN_Q_OVERFLOW case of fanotify_read() If the FAN_Q_OVERFLOW bit is set in event->mask, the fanotify event metadata will not contain a valid file descriptor, but copy_event_to_user() didn't check for that, and unconditionally does a fd_install() on the file descriptor. Which in turn will cause a BUG_ON() in __fd_install(). Introduced by commit 352e3b249284 ("fanotify: sanitize failure exits in copy_event_to_user()") Mea culpa - missed that path ;-/ Reported-by: Alex Shi Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify_user.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 721d692fa8d4..6fcaeb8c902e 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -258,7 +258,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, if (ret) goto out_close_fd; - fd_install(fd, f); + if (fd != FAN_NOFD) + fd_install(fd, f); return fanotify_event_metadata.event_len; out_close_fd: -- cgit v1.2.1 From 73f7ef435934e952c1d70d83d69921ea5d1f6bd4 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 16 Nov 2012 03:02:58 +0000 Subject: sysctl: Pass useful parameters to sysctl permissions - Current is implicitly available so passing current->nsproxy isn't useful. - The ctl_table_header is needed to find how the sysctl table is connected to the rest of sysctl. - ctl_table_root is available in the ctl_table_header so there is no need to pass it. With these changes it becomes possible to write a version of net_sysctl_permission that takes into account the network namespace of the sysctl table, an important feature in extending the user namespace. Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- fs/proc/proc_sysctl.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index a781bdf06694..701580ddfcc3 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -378,12 +378,13 @@ static int test_perm(int mode, int op) return -EACCES; } -static int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) +static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, int op) { + struct ctl_table_root *root = head->root; int mode; if (root->permissions) - mode = root->permissions(root, current->nsproxy, table); + mode = root->permissions(head, table); else mode = table->mode; @@ -491,7 +492,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, * and won't be until we finish. */ error = -EPERM; - if (sysctl_perm(head->root, table, write ? MAY_WRITE : MAY_READ)) + if (sysctl_perm(head, table, write ? MAY_WRITE : MAY_READ)) goto out; /* if that can happen at all, it should be -EINVAL, not -EISDIR */ @@ -717,7 +718,7 @@ static int proc_sys_permission(struct inode *inode, int mask) if (!table) /* global root - r-xr-xr-x */ error = mask & MAY_WRITE ? 
-EACCES : 0; else /* Use the permissions on the sysctl table entry */ - error = sysctl_perm(head->root, table, mask & ~MAY_NOT_BLOCK); + error = sysctl_perm(head, table, mask & ~MAY_NOT_BLOCK); sysctl_head_finish(head); return error; -- cgit v1.2.1 From e656d8a6f7fdf7612d2f5771f0ddfca9487f59d9 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 10 Jul 2010 14:52:49 -0700 Subject: procfs: Use the proc generic infrastructure for proc/self. I had visions at one point of splitting proc into two filesystems. If that had happened, proc/self being the part of proc that actually deals with pids would have been a nice cleanup. As it is proc/self requires a lot of unnecessary infrastructure for a single file. The only user visible change is that a mounted /proc for a pid namespace that is dead now shows a broken proc symlink, instead of being completely invisible. I don't think anyone will notice or care. Signed-off-by: Eric W. Biederman --- fs/proc/Makefile | 1 + fs/proc/base.c | 154 +---------------------------------------------------- fs/proc/internal.h | 1 + fs/proc/root.c | 1 + fs/proc/self.c | 59 ++++++++++++++++++++ 5 files changed, 64 insertions(+), 152 deletions(-) create mode 100644 fs/proc/self.c (limited to 'fs') diff --git a/fs/proc/Makefile b/fs/proc/Makefile index 99349efbbc2b..981b05601931 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -21,6 +21,7 @@ proc-y += uptime.o proc-y += version.o proc-y += softirqs.o proc-y += namespaces.o +proc-y += self.o proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o proc-$(CONFIG_NET) += proc_net.o proc-$(CONFIG_PROC_KCORE) += kcore.o diff --git a/fs/proc/base.c b/fs/proc/base.c index 144a96732dd7..cbe454e94af8 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2237,146 +2237,6 @@ static const struct file_operations proc_coredump_filter_operations = { }; #endif -/* - * /proc/self: - */ -static int proc_self_readlink(struct dentry *dentry, char __user *buffer, - int buflen) -{ - struct pid_namespace *ns = dentry->d_sb->s_fs_info; - pid_t tgid = task_tgid_nr_ns(current, ns); - char tmp[PROC_NUMBUF]; - if (!tgid) - return -ENOENT; - sprintf(tmp, "%d", tgid); - return vfs_readlink(dentry,buffer,buflen,tmp); -} - -static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct pid_namespace *ns = dentry->d_sb->s_fs_info; - pid_t tgid = task_tgid_nr_ns(current, ns); - char *name = ERR_PTR(-ENOENT); - if (tgid) { - /* 11 for max length of signed int in decimal + NULL term */ - name = kmalloc(12, GFP_KERNEL); - if (!name) - name = ERR_PTR(-ENOMEM); - else - sprintf(name, "%d", tgid); - } - nd_set_link(nd, name); - return NULL; -} - -static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd, - void *cookie) -{ - char *s = nd_get_link(nd); - if (!IS_ERR(s)) - kfree(s); -} - -static const struct inode_operations proc_self_inode_operations = { - .readlink = proc_self_readlink, - .follow_link = proc_self_follow_link, - .put_link = proc_self_put_link, -}; - -/* - * proc base - * - * These are the directory entries in the root directory of /proc - * that properly belong to the /proc filesystem, as they describe - * describe something that is process related. 
- */ -static const struct pid_entry proc_base_stuff[] = { - NOD("self", S_IFLNK|S_IRWXUGO, - &proc_self_inode_operations, NULL, {}), -}; - -static struct dentry *proc_base_instantiate(struct inode *dir, - struct dentry *dentry, struct task_struct *task, const void *ptr) -{ - const struct pid_entry *p = ptr; - struct inode *inode; - struct proc_inode *ei; - struct dentry *error; - - /* Allocate the inode */ - error = ERR_PTR(-ENOMEM); - inode = new_inode(dir->i_sb); - if (!inode) - goto out; - - /* Initialize the inode */ - ei = PROC_I(inode); - inode->i_ino = get_next_ino(); - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - - /* - * grab the reference to the task. - */ - ei->pid = get_task_pid(task, PIDTYPE_PID); - if (!ei->pid) - goto out_iput; - - inode->i_mode = p->mode; - if (S_ISDIR(inode->i_mode)) - set_nlink(inode, 2); - if (S_ISLNK(inode->i_mode)) - inode->i_size = 64; - if (p->iop) - inode->i_op = p->iop; - if (p->fop) - inode->i_fop = p->fop; - ei->op = p->op; - d_add(dentry, inode); - error = NULL; -out: - return error; -out_iput: - iput(inode); - goto out; -} - -static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry) -{ - struct dentry *error; - struct task_struct *task = get_proc_task(dir); - const struct pid_entry *p, *last; - - error = ERR_PTR(-ENOENT); - - if (!task) - goto out_no_task; - - /* Lookup the directory entry */ - last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1]; - for (p = proc_base_stuff; p <= last; p++) { - if (p->len != dentry->d_name.len) - continue; - if (!memcmp(dentry->d_name.name, p->name, p->len)) - break; - } - if (p > last) - goto out; - - error = proc_base_instantiate(dir, dentry, task, p); - -out: - put_task_struct(task); -out_no_task: - return error; -} - -static int proc_base_fill_cache(struct file *filp, void *dirent, - filldir_t filldir, struct task_struct *task, const struct pid_entry *p) -{ - return proc_fill_cache(filp, dirent, filldir, p->name, p->len, - proc_base_instantiate, task, p); -} - #ifdef CONFIG_TASK_IO_ACCOUNTING static int do_io_accounting(struct task_struct *task, char *buffer, int whole) { @@ -2767,15 +2627,11 @@ out: struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { - struct dentry *result; + struct dentry *result = NULL; struct task_struct *task; unsigned tgid; struct pid_namespace *ns; - result = proc_base_lookup(dir, dentry); - if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT) - goto out; - tgid = name_to_int(dentry); if (tgid == ~0U) goto out; @@ -2838,7 +2694,7 @@ retry: return iter; } -#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff)) +#define TGID_OFFSET (FIRST_PROCESS_ENTRY) static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir, struct tgid_iter iter) @@ -2872,12 +2728,6 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) if (!reaper) goto out_no_task; - for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) { - const struct pid_entry *p = &proc_base_stuff[nr]; - if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0) - goto out; - } - ns = filp->f_dentry->d_sb->s_fs_info; iter.task = NULL; iter.tgid = filp->f_pos - TGID_OFFSET; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 43973b084abf..252544c05207 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -15,6 +15,7 @@ struct ctl_table_header; struct mempolicy; extern struct proc_dir_entry proc_root; +extern void proc_self_init(void); #ifdef CONFIG_PROC_SYSCTL 
extern int proc_sys_init(void); extern void sysctl_head_put(struct ctl_table_header *head); diff --git a/fs/proc/root.c b/fs/proc/root.c index 9889a92d2e01..5da984959edc 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -169,6 +169,7 @@ void __init proc_root_init(void) return; } + proc_self_init(); proc_symlink("mounts", NULL, "self/mounts"); proc_net_init(); diff --git a/fs/proc/self.c b/fs/proc/self.c new file mode 100644 index 000000000000..aa5cc3bff140 --- /dev/null +++ b/fs/proc/self.c @@ -0,0 +1,59 @@ +#include +#include +#include + +/* + * /proc/self: + */ +static int proc_self_readlink(struct dentry *dentry, char __user *buffer, + int buflen) +{ + struct pid_namespace *ns = dentry->d_sb->s_fs_info; + pid_t tgid = task_tgid_nr_ns(current, ns); + char tmp[PROC_NUMBUF]; + if (!tgid) + return -ENOENT; + sprintf(tmp, "%d", tgid); + return vfs_readlink(dentry,buffer,buflen,tmp); +} + +static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct pid_namespace *ns = dentry->d_sb->s_fs_info; + pid_t tgid = task_tgid_nr_ns(current, ns); + char *name = ERR_PTR(-ENOENT); + if (tgid) { + /* 11 for max length of signed int in decimal + NULL term */ + name = kmalloc(12, GFP_KERNEL); + if (!name) + name = ERR_PTR(-ENOMEM); + else + sprintf(name, "%d", tgid); + } + nd_set_link(nd, name); + return NULL; +} + +static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + char *s = nd_get_link(nd); + if (!IS_ERR(s)) + kfree(s); +} + +static const struct inode_operations proc_self_inode_operations = { + .readlink = proc_self_readlink, + .follow_link = proc_self_follow_link, + .put_link = proc_self_put_link, +}; + +void __init proc_self_init(void) +{ + struct proc_dir_entry *proc_self_symlink; + mode_t mode; + + mode = S_IFLNK | S_IRWXUGO; + proc_self_symlink = proc_create("self", mode, NULL, NULL ); + proc_self_symlink->proc_iops = &proc_self_inode_operations; +} -- cgit v1.2.1 From ae06c7c83fc6e97ba247a261921c101960f3d28f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 10 Jul 2010 15:23:34 -0700 Subject: procfs: Don't cache a pid in the root inode. Now that we have s_fs_info pointing to our pid namespace the original reason for the proc root inode having a struct pid is gone. Caching a pid in the root inode has led to some complicated code. Now that we don't need the struct pid, just remove it. Signed-off-by: Eric W. 
Biederman --- fs/proc/base.c | 11 +---------- fs/proc/root.c | 8 -------- 2 files changed, 1 insertion(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index cbe454e94af8..6177fc238fdb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2714,19 +2714,12 @@ static int fake_filldir(void *buf, const char *name, int namelen, /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { - unsigned int nr; - struct task_struct *reaper; struct tgid_iter iter; struct pid_namespace *ns; filldir_t __filldir; if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET) - goto out_no_task; - nr = filp->f_pos - FIRST_PROCESS_ENTRY; - - reaper = get_proc_task(filp->f_path.dentry->d_inode); - if (!reaper) - goto out_no_task; + goto out; ns = filp->f_dentry->d_sb->s_fs_info; iter.task = NULL; @@ -2747,8 +2740,6 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) } filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET; out: - put_task_struct(reaper); -out_no_task: return 0; } diff --git a/fs/proc/root.c b/fs/proc/root.c index 5da984959edc..13ef6247e7a3 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -100,7 +100,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, int err; struct super_block *sb; struct pid_namespace *ns; - struct proc_inode *ei; char *options; if (flags & MS_KERNMOUNT) { @@ -130,13 +129,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, sb->s_flags |= MS_ACTIVE; } - ei = PROC_I(sb->s_root->d_inode); - if (!ei->pid) { - rcu_read_lock(); - ei->pid = get_pid(find_pid_ns(1, ns)); - rcu_read_unlock(); - } - return dget(sb->s_root); } -- cgit v1.2.1 From 17cf22c33e1f1b5e435469c84e43872579497653 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 2 Mar 2010 14:51:53 -0800 Subject: pidns: Use task_active_pid_ns where appropriate The expressions tsk->nsproxy->pid_ns and task_active_pid_ns aka ns_of_pid(task_pid(tsk)) should have the same number of cache line misses with the practical difference that ns_of_pid(task_pid(tsk)) is released later in a process's life. Furthermore by using task_active_pid_ns it becomes trivial to write an unshare implementation for the pid namespace. So I have used task_active_pid_ns everywhere I can. In fork, since the pid has not yet been attached to the process, I use ns_of_pid to achieve the same effect. Signed-off-by: Eric W. Biederman --- fs/hppfs/hppfs.c | 2 +- fs/proc/root.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 78f21f8dc2ec..43b315f2002b 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -710,7 +710,7 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent) struct vfsmount *proc_mnt; int err = -ENOENT; - proc_mnt = mntget(current->nsproxy->pid_ns->proc_mnt); + proc_mnt = mntget(task_active_pid_ns(current)->proc_mnt); if (IS_ERR(proc_mnt)) goto out; diff --git a/fs/proc/root.c b/fs/proc/root.c index 13ef6247e7a3..fc1609321a78 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -106,7 +106,7 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, ns = (struct pid_namespace *)data; options = NULL; } else { - ns = current->nsproxy->pid_ns; + ns = task_active_pid_ns(current); options = data; } -- cgit v1.2.1 From 0a01f2cc390e10633a54f72c608cc3fe19a50c3d Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Wed, 1 Aug 2012 10:33:47 -0700 Subject: pidns: Make the pidns proc mount/umount logic obvious. Track the number of pids in the proc hash table. When the number of pids goes to 0 schedule work to unmount the kernel mount of proc. Move the mount of proc into alloc_pid when we allocate the pid for init. Remove the surprising calls of pid_ns_release proc in fork and proc_flush_task. Those code paths really shouldn't know about proc namespace implementation details and people have demonstrated several times that finding and understanding those code paths is difficult and non-obvious. Because of the call path detach pid is alwasy called with the rtnl_lock held free_pid is not allowed to sleep, so the work to unmounting proc is moved to a work queue. This has the side benefit of not blocking the entire world waiting for the unnecessary rcu_barrier in deactivate_locked_super. In the process of making the code clear and obvious this fixes a bug reported by Gao feng where we would leak a mount of proc during clone(CLONE_NEWPID|CLONE_NEWNET) if copy_pid_ns succeeded and copy_net_ns failed. Acked-by: "Serge E. Hallyn" Signed-off-by: "Eric W. Biederman" --- fs/proc/base.c | 4 ---- fs/proc/root.c | 5 ----- 2 files changed, 9 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 6177fc238fdb..7621dc51cff8 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2590,10 +2590,6 @@ void proc_flush_task(struct task_struct *task) proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, tgid->numbers[i].nr); } - - upid = &pid->numbers[pid->level]; - if (upid->nr == 1) - pid_ns_release_proc(upid->ns); } static struct dentry *proc_pid_instantiate(struct inode *dir, diff --git a/fs/proc/root.c b/fs/proc/root.c index fc1609321a78..f2f251158d35 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -155,11 +155,6 @@ void __init proc_root_init(void) err = register_filesystem(&proc_fs_type); if (err) return; - err = pid_ns_prepare_proc(&init_pid_ns); - if (err) { - unregister_filesystem(&proc_fs_type); - return; - } proc_self_init(); proc_symlink("mounts", NULL, "self/mounts"); -- cgit v1.2.1 From 57e8391d327609cbf12d843259c968b9e5c1838f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 7 Mar 2010 18:17:03 -0800 Subject: pidns: Add setns support - Pid namespaces are designed to be inescapable so verify that the passed in pid namespace is a child of the currently active pid namespace or the currently active pid namespace itself. Allowing the currently active pid namespace is important so the effects of an earlier setns can be cancelled. Signed-off-by: Eric W. Biederman --- fs/proc/namespaces.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index b178ed733c36..85ca047e35f1 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -24,6 +24,9 @@ static const struct proc_ns_operations *ns_entries[] = { #ifdef CONFIG_IPC_NS &ipcns_operations, #endif +#ifdef CONFIG_PID_NS + &pidns_operations, +#endif }; static const struct file_operations ns_file_operations = { -- cgit v1.2.1 From a85fb273c94648cbf20a5f9bcf8bbbb075f271ad Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 31 Jul 2012 01:14:12 -0700 Subject: vfs: Allow chroot if you have CAP_SYS_CHROOT in your user namespace Once you are confined to a user namespace applications can not gain privilege and escape the user namespace so there is no longer a reason to restrict chroot. Acked-by: Serge Hallyn Signed-off-by: "Eric W. 
Biederman" --- fs/open.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index 59071f55bf7f..182d8667b7bd 100644 --- a/fs/open.c +++ b/fs/open.c @@ -435,7 +435,7 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename) goto dput_and_out; error = -EPERM; - if (!capable(CAP_SYS_CHROOT)) + if (!nsown_capable(CAP_SYS_CHROOT)) goto dput_and_out; error = security_path_chroot(&path); if (error) -- cgit v1.2.1 From 8823c079ba7136dc1948d6f6dcb5f8022bde438e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 7 Mar 2010 18:49:36 -0800 Subject: vfs: Add setns support for the mount namespace setns support for the mount namespace is a little tricky as an arbitrary decision must be made about what to set fs->root and fs->pwd to, as there is no expectation of a relationship between the two mount namespaces. Therefore I arbitrarily find the root mount point, and follow every mount on top of it to find the top of the mount stack. Then I set fs->root and fs->pwd to that location. The topmost root of the mount stack seems like a reasonable place to be. Bind mount support for the mount namespace inodes has the possibility of creating circular dependencies between mount namespaces. Circular dependencies can result in loops that prevent mount namespaces from every being freed. I avoid creating those circular dependencies by adding a sequence number to the mount namespace and require all bind mounts be of a younger mount namespace into an older mount namespace. Add a helper function proc_ns_inode so it is possible to detect when we are attempting to bind mound a namespace inode. Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- fs/mount.h | 1 + fs/namespace.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/proc/namespaces.c | 5 +++ 3 files changed, 101 insertions(+) (limited to 'fs') diff --git a/fs/mount.h b/fs/mount.h index 4f291f9de641..e9c37dd3d00d 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -6,6 +6,7 @@ struct mnt_namespace { atomic_t count; struct mount * root; struct list_head list; + u64 seq; /* Sequence number to prevent loops */ wait_queue_head_t poll; int event; }; diff --git a/fs/namespace.c b/fs/namespace.c index 24960626bb6b..d287e7e74644 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -20,6 +20,7 @@ #include /* get_fs_root et.al. */ #include /* fsnotify_vfsmount_delete */ #include +#include #include "pnode.h" #include "internal.h" @@ -1308,6 +1309,26 @@ static int mount_is_safe(struct path *path) #endif } +static bool mnt_ns_loop(struct path *path) +{ + /* Could bind mounting the mount namespace inode cause a + * mount namespace loop? + */ + struct inode *inode = path->dentry->d_inode; + struct proc_inode *ei; + struct mnt_namespace *mnt_ns; + + if (!proc_ns_inode(inode)) + return false; + + ei = PROC_I(inode); + if (ei->ns_ops != &mntns_operations) + return false; + + mnt_ns = ei->ns; + return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; +} + struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, int flag) { @@ -1655,6 +1676,10 @@ static int do_loopback(struct path *path, const char *old_name, if (err) return err; + err = -EINVAL; + if (mnt_ns_loop(&old_path)) + goto out; + err = lock_mount(path); if (err) goto out; @@ -2261,6 +2286,15 @@ dput_out: return retval; } +/* + * Assign a sequence number so we can detect when we attempt to bind + * mount a reference to an older mount namespace into the current + * mount namespace, preventing reference counting loops. 
A 64bit + * number incrementing at 10Ghz will take 12,427 years to wrap which + * is effectively never, so we can ignore the possibility. + */ +static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); + static struct mnt_namespace *alloc_mnt_ns(void) { struct mnt_namespace *new_ns; @@ -2268,6 +2302,7 @@ static struct mnt_namespace *alloc_mnt_ns(void) new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); if (!new_ns) return ERR_PTR(-ENOMEM); + new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); atomic_set(&new_ns->count, 1); new_ns->root = NULL; INIT_LIST_HEAD(&new_ns->list); @@ -2681,3 +2716,63 @@ bool our_mnt(struct vfsmount *mnt) { return check_mnt(real_mount(mnt)); } + +static void *mntns_get(struct task_struct *task) +{ + struct mnt_namespace *ns = NULL; + struct nsproxy *nsproxy; + + rcu_read_lock(); + nsproxy = task_nsproxy(task); + if (nsproxy) { + ns = nsproxy->mnt_ns; + get_mnt_ns(ns); + } + rcu_read_unlock(); + + return ns; +} + +static void mntns_put(void *ns) +{ + put_mnt_ns(ns); +} + +static int mntns_install(struct nsproxy *nsproxy, void *ns) +{ + struct fs_struct *fs = current->fs; + struct mnt_namespace *mnt_ns = ns; + struct path root; + + if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_CHROOT)) + return -EINVAL; + + if (fs->users != 1) + return -EINVAL; + + get_mnt_ns(mnt_ns); + put_mnt_ns(nsproxy->mnt_ns); + nsproxy->mnt_ns = mnt_ns; + + /* Find the root */ + root.mnt = &mnt_ns->root->mnt; + root.dentry = mnt_ns->root->mnt.mnt_root; + path_get(&root); + while(d_mountpoint(root.dentry) && follow_down_one(&root)) + ; + + /* Update the pwd and root */ + set_fs_pwd(fs, &root); + set_fs_root(fs, &root); + + path_put(&root); + return 0; +} + +const struct proc_ns_operations mntns_operations = { + .name = "mnt", + .type = CLONE_NEWNS, + .get = mntns_get, + .put = mntns_put, + .install = mntns_install, +}; diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 85ca047e35f1..2a17fd9ae6a9 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -27,6 +27,7 @@ static const struct proc_ns_operations *ns_entries[] = { #ifdef CONFIG_PID_NS &pidns_operations, #endif + &mntns_operations, }; static const struct file_operations ns_file_operations = { @@ -201,3 +202,7 @@ out_invalid: return ERR_PTR(-EINVAL); } +bool proc_ns_inode(struct inode *inode) +{ + return inode->i_fop == &ns_file_operations; +} -- cgit v1.2.1 From 771b1371686e0a63e938ada28de020b9a0040f55 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 26 Jul 2012 21:08:32 -0700 Subject: vfs: Add a user namespace reference from struct mnt_namespace This will allow support for unprivileged mounts in a new user namespace. Acked-by: "Serge E. Hallyn" Signed-off-by: "Eric W. 
Biederman" --- fs/mount.h | 1 + fs/namespace.c | 24 ++++++++++++++++-------- 2 files changed, 17 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/mount.h b/fs/mount.h index e9c37dd3d00d..630fafc616bb 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -6,6 +6,7 @@ struct mnt_namespace { atomic_t count; struct mount * root; struct list_head list; + struct user_namespace *user_ns; u64 seq; /* Sequence number to prevent loops */ wait_queue_head_t poll; int event; diff --git a/fs/namespace.c b/fs/namespace.c index d287e7e74644..207c7ba84ad3 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -2286,6 +2287,12 @@ dput_out: return retval; } +static void free_mnt_ns(struct mnt_namespace *ns) +{ + put_user_ns(ns->user_ns); + kfree(ns); +} + /* * Assign a sequence number so we can detect when we attempt to bind * mount a reference to an older mount namespace into the current @@ -2295,7 +2302,7 @@ dput_out: */ static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); -static struct mnt_namespace *alloc_mnt_ns(void) +static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) { struct mnt_namespace *new_ns; @@ -2308,6 +2315,7 @@ static struct mnt_namespace *alloc_mnt_ns(void) INIT_LIST_HEAD(&new_ns->list); init_waitqueue_head(&new_ns->poll); new_ns->event = 0; + new_ns->user_ns = get_user_ns(user_ns); return new_ns; } @@ -2316,7 +2324,7 @@ static struct mnt_namespace *alloc_mnt_ns(void) * copied from the namespace of the passed in task structure. */ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, - struct fs_struct *fs) + struct user_namespace *user_ns, struct fs_struct *fs) { struct mnt_namespace *new_ns; struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; @@ -2324,7 +2332,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, struct mount *old = mnt_ns->root; struct mount *new; - new_ns = alloc_mnt_ns(); + new_ns = alloc_mnt_ns(user_ns); if (IS_ERR(new_ns)) return new_ns; @@ -2333,7 +2341,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, new = copy_tree(old, old->mnt.mnt_root, CL_COPY_ALL | CL_EXPIRE); if (IS_ERR(new)) { up_write(&namespace_sem); - kfree(new_ns); + free_mnt_ns(new_ns); return ERR_CAST(new); } new_ns->root = new; @@ -2374,7 +2382,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, } struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, - struct fs_struct *new_fs) + struct user_namespace *user_ns, struct fs_struct *new_fs) { struct mnt_namespace *new_ns; @@ -2384,7 +2392,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, if (!(flags & CLONE_NEWNS)) return ns; - new_ns = dup_mnt_ns(ns, new_fs); + new_ns = dup_mnt_ns(ns, user_ns, new_fs); put_mnt_ns(ns); return new_ns; @@ -2396,7 +2404,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, */ static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) { - struct mnt_namespace *new_ns = alloc_mnt_ns(); + struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns); if (!IS_ERR(new_ns)) { struct mount *mnt = real_mount(m); mnt->mnt_ns = new_ns; @@ -2682,7 +2690,7 @@ void put_mnt_ns(struct mnt_namespace *ns) br_write_unlock(&vfsmount_lock); up_write(&namespace_sem); release_mounts(&umount_list); - kfree(ns); + free_mnt_ns(ns); } struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) -- cgit v1.2.1 From 7a472ef4be8387bc05a42e16309b02c8ca943a40 Mon 
Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 31 Jul 2012 13:13:04 -0700 Subject: vfs: Only support slave subtrees across different user namespaces Sharing mount subtrees with mount namespaces created by unprivileged users allows unprivileged mounts created by unprivileged users to propagate to mount namespaces controlled by privileged users. Prevent nasty consequences by changing shared subtrees to slave subtrees when an unprivileged user creates a new mount namespace. Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 11 ++++++++--- fs/pnode.h | 1 + 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 207c7ba84ad3..4dfcaf05d17c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -786,7 +786,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, if (!mnt) return ERR_PTR(-ENOMEM); - if (flag & (CL_SLAVE | CL_PRIVATE)) + if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE)) mnt->mnt_group_id = 0; /* not a peer of original */ else mnt->mnt_group_id = old->mnt_group_id; @@ -807,7 +807,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, list_add_tail(&mnt->mnt_instance, &sb->s_mounts); br_write_unlock(&vfsmount_lock); - if (flag & CL_SLAVE) { + if ((flag & CL_SLAVE) || + ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) { list_add(&mnt->mnt_slave, &old->mnt_slave_list); mnt->mnt_master = old; CLEAR_MNT_SHARED(mnt); @@ -2331,6 +2332,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, struct mount *p, *q; struct mount *old = mnt_ns->root; struct mount *new; + int copy_flags; new_ns = alloc_mnt_ns(user_ns); if (IS_ERR(new_ns)) @@ -2338,7 +2340,10 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, down_write(&namespace_sem); /* First pass: copy the tree topology */ - new = copy_tree(old, old->mnt.mnt_root, CL_COPY_ALL | CL_EXPIRE); + copy_flags = CL_COPY_ALL | CL_EXPIRE; + if (user_ns != mnt_ns->user_ns) + copy_flags |= CL_SHARED_TO_SLAVE; + new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { up_write(&namespace_sem); free_mnt_ns(new_ns); diff --git a/fs/pnode.h b/fs/pnode.h index 65c60979d541..19b853a3445c 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -22,6 +22,7 @@ #define CL_COPY_ALL 0x04 #define CL_MAKE_SHARED 0x08 #define CL_PRIVATE 0x10 +#define CL_SHARED_TO_SLAVE 0x20 static inline void set_mnt_shared(struct mount *mnt) { -- cgit v1.2.1 From 0c55cfc4166d9a0f38de779bd4d75a90afbe7734 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 26 Jul 2012 21:42:03 -0700 Subject: vfs: Allow unprivileged manipulation of the mount namespace. - Add a filesystem flag to mark filesystems that are safe to mount as an unprivileged user. - Add a filesystem flag to mark filesystems that don't need MNT_NODEV when mounted by an unprivileged user. - Relax the permission checks to allow unprivileged users that have CAP_SYS_ADMIN permissions in the user namespace referred to by the current mount namespace to mount, unmount, and move filesystems. Acked-by: "Serge E. Hallyn" Signed-off-by: "Eric W. 
Biederman" --- fs/namespace.c | 69 ++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 43 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 4dfcaf05d17c..9ddc86f93221 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1269,7 +1269,7 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags) goto dput_and_out; retval = -EPERM; - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) goto dput_and_out; retval = do_umount(mnt, flags); @@ -1295,7 +1295,7 @@ SYSCALL_DEFINE1(oldumount, char __user *, name) static int mount_is_safe(struct path *path) { - if (capable(CAP_SYS_ADMIN)) + if (ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN)) return 0; return -EPERM; #ifdef notyet @@ -1633,7 +1633,7 @@ static int do_change_type(struct path *path, int flag) int type; int err = 0; - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; if (path->dentry != path->mnt->mnt_root) @@ -1797,7 +1797,7 @@ static int do_move_mount(struct path *path, const char *old_name) struct mount *p; struct mount *old; int err = 0; - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; if (!old_name || !*old_name) return -EINVAL; @@ -1884,21 +1884,6 @@ static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) return ERR_PTR(err); } -static struct vfsmount * -do_kern_mount(const char *fstype, int flags, const char *name, void *data) -{ - struct file_system_type *type = get_fs_type(fstype); - struct vfsmount *mnt; - if (!type) - return ERR_PTR(-ENODEV); - mnt = vfs_kern_mount(type, flags, name, data); - if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && - !mnt->mnt_sb->s_subtype) - mnt = fs_set_subtype(mnt, fstype); - put_filesystem(type); - return mnt; -} - /* * add a mount into a namespace's mount tree */ @@ -1944,20 +1929,46 @@ unlock: * create a new mount for userspace and request it to be added into the * namespace's tree */ -static int do_new_mount(struct path *path, const char *type, int flags, +static int do_new_mount(struct path *path, const char *fstype, int flags, int mnt_flags, const char *name, void *data) { + struct file_system_type *type; + struct user_namespace *user_ns; struct vfsmount *mnt; int err; - if (!type) + if (!fstype) return -EINVAL; /* we need capabilities... */ - if (!capable(CAP_SYS_ADMIN)) + user_ns = real_mount(path->mnt)->mnt_ns->user_ns; + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; - mnt = do_kern_mount(type, flags, name, data); + type = get_fs_type(fstype); + if (!type) + return -ENODEV; + + if (user_ns != &init_user_ns) { + if (!(type->fs_flags & FS_USERNS_MOUNT)) { + put_filesystem(type); + return -EPERM; + } + /* Only in special cases allow devices from mounts + * created outside the initial user namespace. 
+ */ + if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) { + flags |= MS_NODEV; + mnt_flags |= MNT_NODEV; + } + } + + mnt = vfs_kern_mount(type, flags, name, data); + if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && + !mnt->mnt_sb->s_subtype) + mnt = fs_set_subtype(mnt, fstype); + + put_filesystem(type); if (IS_ERR(mnt)) return PTR_ERR(mnt); @@ -2549,7 +2560,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, struct mount *new_mnt, *root_mnt; int error; - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; error = user_path_dir(new_root, &new); @@ -2631,8 +2642,13 @@ static void __init init_mount_tree(void) struct vfsmount *mnt; struct mnt_namespace *ns; struct path root; + struct file_system_type *type; - mnt = do_kern_mount("rootfs", 0, "rootfs", NULL); + type = get_fs_type("rootfs"); + if (!type) + panic("Can't find rootfs type"); + mnt = vfs_kern_mount(type, 0, "rootfs", NULL); + put_filesystem(type); if (IS_ERR(mnt)) panic("Can't create rootfs"); @@ -2757,7 +2773,8 @@ static int mntns_install(struct nsproxy *nsproxy, void *ns) struct mnt_namespace *mnt_ns = ns; struct path root; - if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_CHROOT)) + if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || + !nsown_capable(CAP_SYS_CHROOT)) return -EINVAL; if (fs->users != 1) -- cgit v1.2.1 From ae11e0f18482bfe0cd83b9b61434ea7e0bd94e25 Mon Sep 17 00:00:00 2001 From: Zhao Hongjiang Date: Thu, 13 Sep 2012 16:38:03 +0800 Subject: userns: fix return value on mntns_install() failure Change return value from -EINVAL to -EPERM when the permission check fails. Signed-off-by: Zhao Hongjiang Signed-off-by: Eric W. Biederman --- fs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 9ddc86f93221..cab78a74aca3 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2775,7 +2775,7 @@ static int mntns_install(struct nsproxy *nsproxy, void *ns) if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || !nsown_capable(CAP_SYS_CHROOT)) - return -EINVAL; + return -EPERM; if (fs->users != 1) return -EINVAL; -- cgit v1.2.1 From 3cdf5b45ffbac294bcdfac0393df72f7687c01e8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 21 Nov 2011 16:40:54 -0800 Subject: userns: Ignore suid and sgid on binaries if the uid or gid can not be mapped When performing an exec where the binary lives in one user namespace and the execing process lives in another user namespace there is the possibility that the target uids can not be represented. Instead of failing the exec simply ignore the suid/sgid bits and run the binary with lower privileges. We already do this in the case of MNT_NOSUID so this should be a well tested code path. As the user and group are not changed this should not introduce any security issues. Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- fs/exec.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 0039055b1fc6..aef0c2f19750 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1266,14 +1266,13 @@ int prepare_binprm(struct linux_binprm *bprm) bprm->cred->egid = current_egid(); if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) && - !current->no_new_privs) { + !current->no_new_privs && + kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) && + kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) { /* Set-uid? 
*/ if (mode & S_ISUID) { - if (!kuid_has_mapping(bprm->cred->user_ns, inode->i_uid)) - return -EPERM; bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->euid = inode->i_uid; - } /* Set-gid? */ @@ -1283,8 +1282,6 @@ int prepare_binprm(struct linux_binprm *bprm) * executable. */ if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { - if (!kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) - return -EPERM; bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->egid = inode->i_gid; } -- cgit v1.2.1 From 3bb3e1fc47aca554e7e2cc4deeddc24750987ac2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 13 Nov 2012 14:55:52 +0100 Subject: reiserfs: Fix lock ordering during remount When remounting reiserfs dquot_suspend() or dquot_resume() can be called. These functions take dqonoff_mutex which ranks above write lock so we have to drop it before calling into quota code. CC: stable@vger.kernel.org # >= 3.0 Signed-off-by: Jan Kara --- fs/reiserfs/super.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 1078ae179993..5372980ec458 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1335,7 +1335,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) kfree(qf_names[i]); #endif err = -EINVAL; - goto out_err; + goto out_unlock; } #ifdef CONFIG_QUOTA handle_quota_files(s, qf_names, &qfmt); @@ -1379,7 +1379,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) if (blocks) { err = reiserfs_resize(s, blocks); if (err != 0) - goto out_err; + goto out_unlock; } if (*mount_flags & MS_RDONLY) { @@ -1389,9 +1389,15 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) /* it is read-only already */ goto out_ok; + /* + * Drop write lock. Quota will retake it when needed and lock + * ordering requires calling dquot_suspend() without it. + */ + reiserfs_write_unlock(s); err = dquot_suspend(s, -1); if (err < 0) goto out_err; + reiserfs_write_lock(s); /* try to remount file system with read-only permissions */ if (sb_umount_state(rs) == REISERFS_VALID_FS @@ -1401,7 +1407,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) err = journal_begin(&th, s, 10); if (err) - goto out_err; + goto out_unlock; /* Mounting a rw partition read-only. */ reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); @@ -1416,7 +1422,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) if (reiserfs_is_journal_aborted(journal)) { err = journal->j_errno; - goto out_err; + goto out_unlock; } handle_data_mode(s, mount_options); @@ -1425,7 +1431,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) s->s_flags &= ~MS_RDONLY; /* now it is safe to call journal_begin */ err = journal_begin(&th, s, 10); if (err) - goto out_err; + goto out_unlock; /* Mount a partition which is read-only, read-write */ reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); @@ -1442,10 +1448,16 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) SB_JOURNAL(s)->j_must_wait = 1; err = journal_end(&th, s, 10); if (err) - goto out_err; + goto out_unlock; if (!(*mount_flags & MS_RDONLY)) { + /* + * Drop write lock. Quota will retake it when needed and lock + * ordering requires calling dquot_resume() without it. 
+ */ + reiserfs_write_unlock(s); dquot_resume(s, -1); + reiserfs_write_lock(s); finish_unfinished(s); reiserfs_xattr_init(s, *mount_flags); } @@ -1455,9 +1467,10 @@ out_ok: reiserfs_write_unlock(s); return 0; +out_unlock: + reiserfs_write_unlock(s); out_err: kfree(new_opts); - reiserfs_write_unlock(s); return err; } -- cgit v1.2.1 From b9e06ef2e8706fe669b51f4364e3aeed58639eb2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 13 Nov 2012 16:34:17 +0100 Subject: reiserfs: Protect reiserfs_quota_on() with write lock In reiserfs_quota_on() we do quite some work - for example unpacking the tail of a quota file. Thus we have to hold the write lock until the moment we call back into the quota code. CC: stable@vger.kernel.org # >= 3.0 Signed-off-by: Jan Kara --- fs/reiserfs/super.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 5372980ec458..e59d6ddcc69f 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2216,8 +2216,11 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, struct reiserfs_transaction_handle th; int opt = type == USRQUOTA ? REISERFS_USRQUOTA : REISERFS_GRPQUOTA; - if (!(REISERFS_SB(sb)->s_mount_opt & (1 << opt))) - return -EINVAL; + reiserfs_write_lock(sb); + if (!(REISERFS_SB(sb)->s_mount_opt & (1 << opt))) { + err = -EINVAL; + goto out; + } /* Quotafile not on the same filesystem? */ if (path->dentry->d_sb != sb) { @@ -2259,8 +2262,10 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, if (err) goto out; } - err = dquot_quota_on(sb, type, format_id, path); + reiserfs_write_unlock(sb); + return dquot_quota_on(sb, type, format_id, path); out: + reiserfs_write_unlock(sb); return err; } -- cgit v1.2.1 From 361d94a338a3fd0cee6a4ea32bbc427ba228e628 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 13 Nov 2012 18:25:38 +0100 Subject: reiserfs: Protect reiserfs_quota_write() with write lock Calls into reiserfs journalling code and reiserfs_get_block() need to be protected with write lock. We remove the write lock around calls to high level quota code in the next patch, so these paths would otherwise suddenly become unprotected. CC: stable@vger.kernel.org # >= 3.0 Signed-off-by: Jan Kara --- fs/reiserfs/super.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index e59d6ddcc69f..c101704ece48 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2338,7 +2338,9 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, tocopy = sb->s_blocksize - offset < towrite ? sb->s_blocksize - offset : towrite; tmp_bh.b_state = 0; + reiserfs_write_lock(sb); err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE); + reiserfs_write_unlock(sb); if (err) goto out; if (offset || tocopy != sb->s_blocksize) @@ -2354,10 +2356,12 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, flush_dcache_page(bh->b_page); set_buffer_uptodate(bh); unlock_buffer(bh); + reiserfs_write_lock(sb); reiserfs_prepare_for_journal(sb, bh, 1); journal_mark_dirty(current->journal_info, sb, bh); if (!journal_quota) reiserfs_add_ordered_list(inode, bh); + reiserfs_write_unlock(sb); brelse(bh); offset = 0; towrite -= tocopy; -- cgit v1.2.1 From 7af11686933726e99af22901d622f9e161404e6b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 13 Nov 2012 17:05:14 +0100 Subject: reiserfs: Move quota calls out of write lock Calls into high-level quota code cannot happen under the write lock. 
These calls take dqio_mutex which ranks above write lock. So drop write lock before calling back into quota code. CC: stable@vger.kernel.org # >= 3.0 Signed-off-by: Jan Kara --- fs/reiserfs/inode.c | 10 +++++++--- fs/reiserfs/stree.c | 4 ++++ fs/reiserfs/super.c | 18 ++++++++++++++---- 3 files changed, 25 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index f27f01a98aa2..d83736fbc26c 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1782,8 +1782,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, BUG_ON(!th->t_trans_id); - dquot_initialize(inode); + reiserfs_write_unlock(inode->i_sb); err = dquot_alloc_inode(inode); + reiserfs_write_lock(inode->i_sb); if (err) goto out_end_trans; if (!dir->i_nlink) { @@ -1979,8 +1980,10 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, out_end_trans: journal_end(th, th->t_super, th->t_blocks_allocated); + reiserfs_write_unlock(inode->i_sb); /* Drop can be outside and it needs more credits so it's better to have it outside */ dquot_drop(inode); + reiserfs_write_lock(inode->i_sb); inode->i_flags |= S_NOQUOTA; make_bad_inode(inode); @@ -3103,10 +3106,9 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) /* must be turned off for recursive notify_change calls */ ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); - depth = reiserfs_write_lock_once(inode->i_sb); if (is_quota_modification(inode, attr)) dquot_initialize(inode); - + depth = reiserfs_write_lock_once(inode->i_sb); if (attr->ia_valid & ATTR_SIZE) { /* version 2 items will be caught by the s_maxbytes check ** done for us in vmtruncate @@ -3170,7 +3172,9 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) error = journal_begin(&th, inode->i_sb, jbegin_count); if (error) goto out; + reiserfs_write_unlock_once(inode->i_sb, depth); error = dquot_transfer(inode, attr); + depth = reiserfs_write_lock_once(inode->i_sb); if (error) { journal_end(&th, inode->i_sb, jbegin_count); goto out; diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index f8afa4b162b8..2f40a4c70a4d 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -1968,7 +1968,9 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree key2type(&(key->on_disk_key))); #endif + reiserfs_write_unlock(inode->i_sb); retval = dquot_alloc_space_nodirty(inode, pasted_size); + reiserfs_write_lock(inode->i_sb); if (retval) { pathrelse(search_path); return retval; @@ -2061,9 +2063,11 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, "reiserquota insert_item(): allocating %u id=%u type=%c", quota_bytes, inode->i_uid, head2type(ih)); #endif + reiserfs_write_unlock(inode->i_sb); /* We can't dirty inode here. It would be immediately written but * appropriate stat item isn't inserted yet... */ retval = dquot_alloc_space_nodirty(inode, quota_bytes); + reiserfs_write_lock(inode->i_sb); if (retval) { pathrelse(path); return retval; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index c101704ece48..418bdc3a57da 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -298,7 +298,9 @@ static int finish_unfinished(struct super_block *s) retval = remove_save_link_only(s, &save_link_key, 0); continue; } + reiserfs_write_unlock(s); dquot_initialize(inode); + reiserfs_write_lock(s); if (truncate && S_ISDIR(inode->i_mode)) { /* We got a truncate request for a dir which is impossible. 
@@ -2108,13 +2110,15 @@ static int reiserfs_write_dquot(struct dquot *dquot) REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); if (ret) goto out; + reiserfs_write_unlock(dquot->dq_sb); ret = dquot_commit(dquot); + reiserfs_write_lock(dquot->dq_sb); err = journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); if (!ret && err) ret = err; - out: +out: reiserfs_write_unlock(dquot->dq_sb); return ret; } @@ -2130,13 +2134,15 @@ static int reiserfs_acquire_dquot(struct dquot *dquot) REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb)); if (ret) goto out; + reiserfs_write_unlock(dquot->dq_sb); ret = dquot_acquire(dquot); + reiserfs_write_lock(dquot->dq_sb); err = journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb)); if (!ret && err) ret = err; - out: +out: reiserfs_write_unlock(dquot->dq_sb); return ret; } @@ -2150,19 +2156,21 @@ static int reiserfs_release_dquot(struct dquot *dquot) ret = journal_begin(&th, dquot->dq_sb, REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb)); + reiserfs_write_unlock(dquot->dq_sb); if (ret) { /* Release dquot anyway to avoid endless cycle in dqput() */ dquot_release(dquot); goto out; } ret = dquot_release(dquot); + reiserfs_write_lock(dquot->dq_sb); err = journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb)); if (!ret && err) ret = err; - out: reiserfs_write_unlock(dquot->dq_sb); +out: return ret; } @@ -2187,11 +2195,13 @@ static int reiserfs_write_info(struct super_block *sb, int type) ret = journal_begin(&th, sb, 2); if (ret) goto out; + reiserfs_write_unlock(sb); ret = dquot_commit_info(sb, type); + reiserfs_write_lock(sb); err = journal_end(&th, sb, 2); if (!ret && err) ret = err; - out: +out: reiserfs_write_unlock(sb); return ret; } -- cgit v1.2.1 From ae49eeec785025373e28dc24c8351c6bba688d99 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Thu, 11 Oct 2012 12:28:38 +0200 Subject: ext3: Avoid underflow in ext3_trim_fs() Currently if the len argument in ext3_trim_fs() is smaller than one block, the 'end' variable underflows. Avoid that by returning EINVAL if len is smaller than one file system block. Also remove useless unlikely(). Signed-off-by: Lukas Czerner Signed-off-by: Jan Kara --- fs/ext3/balloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 7320a66e958f..22548f56197b 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -2101,8 +2101,9 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) end = start + (range->len >> sb->s_blocksize_bits) - 1; minlen = range->minlen >> sb->s_blocksize_bits; - if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)) || - unlikely(start >= max_blks)) + if (minlen > EXT3_BLOCKS_PER_GROUP(sb) || + start >= max_blks || + range->len < sb->s_blocksize) return -EINVAL; if (end >= max_blks) end = max_blks - 1; -- cgit v1.2.1 From bc02e8693d875c2a9b0037cfd37fe0b726d26403 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 16 Nov 2012 09:20:37 +1100 Subject: xfs: add CRC infrastructure - add a mount feature bit for CRC enabled filesystems - add some helpers for generating and verifying the CRCs - add a uuid_copy helper The checksumming helpers are loosely based on similar ones in sctp; all other bits come from Dave Chinner. 
Signed-off-by: Christoph Hellwig Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/Kconfig | 1 + fs/xfs/uuid.h | 6 ++++++ fs/xfs/xfs_cksum.h | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_linux.h | 1 + fs/xfs/xfs_sb.h | 7 ++++++ 5 files changed, 78 insertions(+) create mode 100644 fs/xfs/xfs_cksum.h (limited to 'fs') diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 6100ec0fa1d4..5a7ffe54f5d5 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -2,6 +2,7 @@ config XFS_FS tristate "XFS filesystem support" depends on BLOCK select EXPORTFS + select LIBCRC32C help XFS is a high performance journaling filesystem which originated on the SGI IRIX platform. It is completely multi-threaded, can diff --git a/fs/xfs/uuid.h b/fs/xfs/uuid.h index 4732d71262cc..104db0f3bed6 100644 --- a/fs/xfs/uuid.h +++ b/fs/xfs/uuid.h @@ -26,4 +26,10 @@ extern int uuid_is_nil(uuid_t *uuid); extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]); +static inline void +uuid_copy(uuid_t *dst, uuid_t *src) +{ + memcpy(dst, src, sizeof(uuid_t)); +} + #endif /* __XFS_SUPPORT_UUID_H__ */ diff --git a/fs/xfs/xfs_cksum.h b/fs/xfs/xfs_cksum.h new file mode 100644 index 000000000000..fad1676ad8cd --- /dev/null +++ b/fs/xfs/xfs_cksum.h @@ -0,0 +1,63 @@ +#ifndef _XFS_CKSUM_H +#define _XFS_CKSUM_H 1 + +#define XFS_CRC_SEED (~(__uint32_t)0) + +/* + * Calculate the intermediate checksum for a buffer that has the CRC field + * inside it. The offset of the 32bit crc fields is passed as the + * cksum_offset parameter. + */ +static inline __uint32_t +xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset) +{ + __uint32_t zero = 0; + __uint32_t crc; + + /* Calculate CRC up to the checksum. */ + crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset); + + /* Skip checksum field */ + crc = crc32c(crc, &zero, sizeof(__u32)); + + /* Calculate the rest of the CRC. */ + return crc32c(crc, &buffer[cksum_offset + sizeof(__be32)], + length - (cksum_offset + sizeof(__be32))); +} + +/* + * Convert the intermediate checksum to the final ondisk format. + * + * The CRC32c calculation uses LE format even on BE machines, but returns the + * result in host endian format. Hence we need to byte swap it back to LE format + * so that it is consistent on disk. + */ +static inline __le32 +xfs_end_cksum(__uint32_t crc) +{ + return ~cpu_to_le32(crc); +} + +/* + * Helper to generate the checksum for a buffer. + */ +static inline void +xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset) +{ + __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); + + *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc); +} + +/* + * Helper to verify the checksum for a buffer. 
+ */ +static inline int +xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset) +{ + __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); + + return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc); +} + +#endif /* _XFS_CKSUM_H */ diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 0a134ca5211c..fe7e4df85a7b 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index f429d9d5d325..a05b45175fb0 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h @@ -81,6 +81,7 @@ struct xfs_mount; #define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ #define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ #define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */ +#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */ #define XFS_SB_VERSION2_OKREALFBITS \ (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ @@ -503,6 +504,12 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp) (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT); } +static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp) +{ + return (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_CRCBIT)); +} + /* * end of superblock version macros */ -- cgit v1.2.1 From 0e446be44806240c779666591bb9e8cb0e86a50d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Nov 2012 22:54:24 +1100 Subject: xfs: add CRC checks to the log Implement CRCs for the log buffers. We re-use a field in struct xlog_rec_header that was used for a weak checksum of the log buffer payload in debug builds before. The new checksumming uses the crc32c checksum we will use elsewhere in XFS, and also protects the record header and additional cycle data. Due to this there are some interesting changes in xlog_sync, as we need to do the cycle wrapping for the split buffer case much earlier, as we would touch the buffer after generating the checksum otherwise. The CRC calculation is always enabled, even for non-CRC filesystems, as adding this CRC does not change the log format. On non-CRC filesystems, only issue an alert if a CRC mismatch is found and allow recovery to continue - this will act as an indicator that log recovery problems are a result of log corruption. On CRC enabled filesystems, however, log recovery will fail. Note that existing debug kernels will write a simple checksum value to the log, so the first time this is run on a filesystem that was last used on a debug kernel it will throw CRC mismatch warning errors. These can be ignored. Initially based on a patch from Dave Chinner, then modified significantly by Christoph Hellwig. Modified again by Dave Chinner to get to this version. 
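[Editorial note: the xfs_start_cksum()/xfs_end_cksum() helpers added by the previous commit, and the xlog_cksum() routine in the diff below, all follow one pattern: the CRC field lives inside the region being checksummed, so it is treated as zero while the CRC is computed, and the final value is then stored into that field. The standalone C sketch below illustrates only that pattern under stated assumptions; the bitwise crc32c() and struct record here are illustrative stand-ins, not kernel code - the kernel uses its crc32c() library routine (selected via LIBCRC32C) and additionally converts the final CRC to little-endian in xfs_end_cksum().]

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* Bitwise CRC32c (Castagnoli, reflected polynomial 0x82F63B78); a slow
 * stand-in for the kernel's table-driven crc32c() library call. */
static uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
{
	const unsigned char *p = buf;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
	}
	return crc;
}

/* A hypothetical on-disk record whose checksum field lives inside the
 * checksummed region, analogous to xlog_rec_header.h_crc. */
struct record {
	uint32_t magic;
	uint32_t crc;
	char payload[56];
};

/* Compute the CRC over the whole record while treating the embedded crc
 * field as zero, mirroring xfs_start_cksum()/xfs_end_cksum(). */
static uint32_t record_cksum(const struct record *r)
{
	const uint32_t zero = 0;
	size_t off = offsetof(struct record, crc);
	uint32_t crc = ~(uint32_t)0;		/* like XFS_CRC_SEED */

	crc = crc32c(crc, r, off);		/* bytes before the field */
	crc = crc32c(crc, &zero, sizeof(zero));	/* the field, as if zero */
	crc = crc32c(crc, (const char *)r + off + sizeof(zero),
		     sizeof(*r) - off - sizeof(zero)); /* remaining bytes */
	return ~crc;	/* final inversion; the kernel also byte swaps to LE */
}

int main(void)
{
	struct record r = { .magic = 0xfeedbeef };

	strcpy(r.payload, "log record payload");
	r.crc = record_cksum(&r);	/* the stored field never affects the CRC */
	printf("stored crc: 0x%08x\n", r.crc);
	printf("verify: %s\n", r.crc == record_cksum(&r) ? "ok" : "mismatch");
	return 0;
}

Compiled and run, this prints a stable checksum and "ok"; flipping any byte of the record afterwards makes record_cksum() disagree with the stored value, which is the same property the log recovery changes below rely on to detect corruption.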
Signed-off-by: Christoph Hellwig Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 132 ++++++++++++++++++++++++++++++++++++++--------- fs/xfs/xfs_log_priv.h | 11 ++-- fs/xfs/xfs_log_recover.c | 132 ++++++++++++++++++++++------------------------- 3 files changed, 176 insertions(+), 99 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 1d6d2ee08495..c6d6e136ba77 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -35,6 +35,7 @@ #include "xfs_inode.h" #include "xfs_trace.h" #include "xfs_fsops.h" +#include "xfs_cksum.h" kmem_zone_t *xfs_log_ticket_zone; @@ -1489,6 +1490,84 @@ xlog_grant_push_ail( xfs_ail_push(log->l_ailp, threshold_lsn); } +/* + * Stamp cycle number in every block + */ +STATIC void +xlog_pack_data( + struct xlog *log, + struct xlog_in_core *iclog, + int roundoff) +{ + int i, j, k; + int size = iclog->ic_offset + roundoff; + __be32 cycle_lsn; + xfs_caddr_t dp; + + cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); + + dp = iclog->ic_datap; + for (i = 0; i < BTOBB(size); i++) { + if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) + break; + iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; + *(__be32 *)dp = cycle_lsn; + dp += BBSIZE; + } + + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + xlog_in_core_2_t *xhdr = iclog->ic_data; + + for ( ; i < BTOBB(size); i++) { + j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; + *(__be32 *)dp = cycle_lsn; + dp += BBSIZE; + } + + for (i = 1; i < log->l_iclog_heads; i++) + xhdr[i].hic_xheader.xh_cycle = cycle_lsn; + } +} + +/* + * Calculate the checksum for a log buffer. + * + * This is a little more complicated than it should be because the various + * headers and the actual data are non-contiguous. + */ +__be32 +xlog_cksum( + struct xlog *log, + struct xlog_rec_header *rhead, + char *dp, + int size) +{ + __uint32_t crc; + + /* first generate the crc for the record header ... */ + crc = xfs_start_cksum((char *)rhead, + sizeof(struct xlog_rec_header), + offsetof(struct xlog_rec_header, h_crc)); + + /* ... then for additional cycle data for v2 logs ... */ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead; + int i; + + for (i = 1; i < log->l_iclog_heads; i++) { + crc = crc32c(crc, &xhdr[i].hic_xheader, + sizeof(struct xlog_rec_ext_header)); + } + } + + /* ... and finally for the payload */ + crc = crc32c(crc, dp, size); + + return xfs_end_cksum(crc); +} + /* * The bdstrat callback function for log bufs. 
This gives us a central * place to trap bufs in case we get hit by a log I/O error and need to @@ -1549,7 +1628,6 @@ xlog_sync( struct xlog *log, struct xlog_in_core *iclog) { - xfs_caddr_t dptr; /* pointer to byte sized element */ xfs_buf_t *bp; int i; uint count; /* byte count of bwrite */ @@ -1558,6 +1636,7 @@ xlog_sync( int split = 0; /* split write into two regions */ int error; int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); + int size; XFS_STATS_INC(xs_log_writes); ASSERT(atomic_read(&iclog->ic_refcnt) == 0); @@ -1588,13 +1667,10 @@ xlog_sync( xlog_pack_data(log, iclog, roundoff); /* real byte length */ - if (v2) { - iclog->ic_header.h_len = - cpu_to_be32(iclog->ic_offset + roundoff); - } else { - iclog->ic_header.h_len = - cpu_to_be32(iclog->ic_offset); - } + size = iclog->ic_offset; + if (v2) + size += roundoff; + iclog->ic_header.h_len = cpu_to_be32(size); bp = iclog->ic_bp; XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn))); @@ -1603,12 +1679,36 @@ xlog_sync( /* Do we need to split this write into 2 parts? */ if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) { + char *dptr; + split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp))); count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)); - iclog->ic_bwritecnt = 2; /* split into 2 writes */ + iclog->ic_bwritecnt = 2; + + /* + * Bump the cycle numbers at the start of each block in the + * part of the iclog that ends up in the buffer that gets + * written to the start of the log. + * + * Watch out for the header magic number case, though. + */ + dptr = (char *)&iclog->ic_header + count; + for (i = 0; i < split; i += BBSIZE) { + __uint32_t cycle = be32_to_cpu(*(__be32 *)dptr); + if (++cycle == XLOG_HEADER_MAGIC_NUM) + cycle++; + *(__be32 *)dptr = cpu_to_be32(cycle); + + dptr += BBSIZE; + } } else { iclog->ic_bwritecnt = 1; } + + /* calculate the checksum */ + iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, + iclog->ic_datap, size); + bp->b_io_length = BTOBB(count); bp->b_fspriv = iclog; XFS_BUF_ZEROFLAGS(bp); @@ -1662,19 +1762,6 @@ xlog_sync( bp->b_flags |= XBF_SYNCIO; if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) bp->b_flags |= XBF_FUA; - dptr = bp->b_addr; - /* - * Bump the cycle numbers at the start of each block - * since this part of the buffer is at the start of - * a new cycle. Watch out for the header magic number - * case, though.
- */ - for (i = 0; i < split; i += BBSIZE) { - be32_add_cpu((__be32 *)dptr, 1); - if (be32_to_cpu(*(__be32 *)dptr) == XLOG_HEADER_MAGIC_NUM) - be32_add_cpu((__be32 *)dptr, 1); - dptr += BBSIZE; - } ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); @@ -1691,7 +1778,6 @@ xlog_sync( return 0; } /* xlog_sync */ - /* * Deallocate a log structure */ diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 9a4e0e5ec322..dc3498bf17c2 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -139,7 +139,6 @@ static inline uint xlog_get_client_id(__be32 i) /* * Flags for log structure */ -#define XLOG_CHKSUM_MISMATCH 0x1 /* used only during recovery */ #define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ #define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ #define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being @@ -291,7 +290,7 @@ typedef struct xlog_rec_header { __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */ __be64 h_lsn; /* lsn of this LR : 8 */ __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */ - __be32 h_chksum; /* may not be used; non-zero if used : 4 */ + __le32 h_crc; /* crc of log record : 4 */ __be32 h_prev_block; /* block number to previous LR : 4 */ __be32 h_num_logops; /* number of log operations in this LR : 4 */ __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; @@ -555,11 +554,9 @@ xlog_recover( extern int xlog_recover_finish( struct xlog *log); -extern void -xlog_pack_data( - struct xlog *log, - struct xlog_in_core *iclog, - int); + +extern __be32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, + char *dp, int size); extern kmem_zone_t *xfs_log_ticket_zone; struct xlog_ticket * diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 931e8e23f192..9c3651c9e75b 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -41,6 +41,7 @@ #include "xfs_trans_priv.h" #include "xfs_quota.h" #include "xfs_utils.h" +#include "xfs_cksum.h" #include "xfs_trace.h" #include "xfs_icache.h" @@ -3216,80 +3217,58 @@ xlog_recover_process_iunlinks( mp->m_dmevmask = mp_dmevmask; } - -#ifdef DEBUG -STATIC void -xlog_pack_data_checksum( - struct xlog *log, - struct xlog_in_core *iclog, - int size) -{ - int i; - __be32 *up; - uint chksum = 0; - - up = (__be32 *)iclog->ic_datap; - /* divide length by 4 to get # words */ - for (i = 0; i < (size >> 2); i++) { - chksum ^= be32_to_cpu(*up); - up++; - } - iclog->ic_header.h_chksum = cpu_to_be32(chksum); -} -#else -#define xlog_pack_data_checksum(log, iclog, size) -#endif - /* - * Stamp cycle number in every block + * Unpack the log buffer data and crc check it. If the check fails, issue a + * warning if and only if the CRC in the header is non-zero. This makes the + * check an advisory warning, and the zero CRC check will prevent failure + * warnings from being emitted when upgrading the kernel from one that does not + * add CRCs by default.
+ * + * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log + * corruption failure */ -void -xlog_pack_data( - struct xlog *log, - struct xlog_in_core *iclog, - int roundoff) +STATIC int +xlog_unpack_data_crc( + struct xlog_rec_header *rhead, + xfs_caddr_t dp, + struct xlog *log) { - int i, j, k; - int size = iclog->ic_offset + roundoff; - __be32 cycle_lsn; - xfs_caddr_t dp; - - xlog_pack_data_checksum(log, iclog, size); - - cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); - - dp = iclog->ic_datap; - for (i = 0; i < BTOBB(size) && - i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { - iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; - *(__be32 *)dp = cycle_lsn; - dp += BBSIZE; - } - - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { - xlog_in_core_2_t *xhdr = iclog->ic_data; - - for ( ; i < BTOBB(size); i++) { - j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; - *(__be32 *)dp = cycle_lsn; - dp += BBSIZE; + __be32 crc; + + crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); + if (crc != rhead->h_crc) { + if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { + xfs_alert(log->l_mp, + "log record CRC mismatch: found 0x%x, expected 0x%x.\n", + be32_to_cpu(rhead->h_crc), + be32_to_cpu(crc)); + xfs_hex_dump(dp, 32); } - for (i = 1; i < log->l_iclog_heads; i++) { - xhdr[i].hic_xheader.xh_cycle = cycle_lsn; - } + /* + * If we've detected a log record corruption, then we can't + * recover past this point. Abort recovery if we are enforcing + * CRC protection by punting an error back up the stack. + */ + if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) + return EFSCORRUPTED; } + + return 0; } -STATIC void +STATIC int xlog_unpack_data( struct xlog_rec_header *rhead, xfs_caddr_t dp, struct xlog *log) { int i, j, k; + int error; + + error = xlog_unpack_data_crc(rhead, dp, log); + if (error) + return error; for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { @@ -3306,6 +3285,8 @@ xlog_unpack_data( dp += BBSIZE; } } + + return 0; } STATIC int @@ -3437,9 +3418,13 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - xlog_unpack_data(rhead, offset, log); - if ((error = xlog_recover_process_data(log, - rhash, rhead, offset, pass))) + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; + + error = xlog_recover_process_data(log, + rhash, rhead, offset, pass); + if (error) goto bread_err2; blk_no += bblks + hblks; } @@ -3549,9 +3534,14 @@ xlog_do_recovery_pass( if (error) goto bread_err2; } - xlog_unpack_data(rhead, offset, log); - if ((error = xlog_recover_process_data(log, rhash, - rhead, offset, pass))) + + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; + + error = xlog_recover_process_data(log, rhash, + rhead, offset, pass); + if (error) goto bread_err2; blk_no += bblks; } @@ -3576,9 +3566,13 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - xlog_unpack_data(rhead, offset, log); - if ((error = xlog_recover_process_data(log, rhash, - rhead, offset, pass))) + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; + + error = xlog_recover_process_data(log, rhash, + rhead, offset, pass); + if (error) goto bread_err2; blk_no += bblks + hblks; } -- cgit v1.2.1 From 7fa294c8991ce0ed4e713f08209eb2ce3e1044ac Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Sun, 2 Sep 2012 19:12:51 -0700 Subject: userns: Allow chown and setgid preservation - Allow chown if CAP_CHOWN is present in the current user namespace and the uid of the inode maps into the current user namespace, and the destination uid or gid maps into the current user namespace. - Allow perserving setgid when changing an inode if CAP_FSETID is present in the current user namespace and the owner of the file has a mapping into the current user namespace. Acked-by: Serge E. Hallyn Signed-off-by: "Eric W. Biederman" --- fs/attr.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/attr.c b/fs/attr.c index cce7df53b694..1449adb14ef6 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -49,14 +49,15 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr) /* Make sure a caller can chown. */ if ((ia_valid & ATTR_UID) && (!uid_eq(current_fsuid(), inode->i_uid) || - !uid_eq(attr->ia_uid, inode->i_uid)) && !capable(CAP_CHOWN)) + !uid_eq(attr->ia_uid, inode->i_uid)) && + !inode_capable(inode, CAP_CHOWN)) return -EPERM; /* Make sure caller can chgrp. */ if ((ia_valid & ATTR_GID) && (!uid_eq(current_fsuid(), inode->i_uid) || (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) && - !capable(CAP_CHOWN)) + !inode_capable(inode, CAP_CHOWN)) return -EPERM; /* Make sure a caller can chmod. */ @@ -65,7 +66,8 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr) return -EPERM; /* Also check the setgid bit! */ if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : - inode->i_gid) && !capable(CAP_FSETID)) + inode->i_gid) && + !inode_capable(inode, CAP_FSETID)) attr->ia_mode &= ~S_ISGID; } @@ -157,7 +159,8 @@ void setattr_copy(struct inode *inode, const struct iattr *attr) if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + if (!in_group_p(inode->i_gid) && + !inode_capable(inode, CAP_FSETID)) mode &= ~S_ISGID; inode->i_mode = mode; } -- cgit v1.2.1 From cde1975bc242f3e1072bde623ef378e547b73f91 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 26 Jul 2012 06:24:06 -0700 Subject: userns: Implent proc namespace operations This allows entering a user namespace, and the ability to store a reference to a user namespace with a bind mount. Addition of missing userns_ns_put in userns_install from Gao feng Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- fs/proc/namespaces.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 2a17fd9ae6a9..030250c27d70 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "internal.h" @@ -26,6 +27,9 @@ static const struct proc_ns_operations *ns_entries[] = { #endif #ifdef CONFIG_PID_NS &pidns_operations, +#endif +#ifdef CONFIG_USER_NS + &userns_operations, #endif &mntns_operations, }; -- cgit v1.2.1 From e9f238c3041e2582a710e75910c8cbf2a98e51b2 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 11 Aug 2012 12:38:26 -0700 Subject: procfs: Print task uids and gids in the userns that opened the proc file Instead of using current_userns() use the userns of the opener of the file so that if the file is passed between processes the contents of the file do not change. Acked-by: Serge E. Hallyn Signed-off-by: "Eric W. 
Biederman" --- fs/proc/array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index c1c207c36cae..554434265613 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -162,7 +162,7 @@ static inline const char *get_task_state(struct task_struct *tsk) static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *p) { - struct user_namespace *user_ns = current_user_ns(); + struct user_namespace *user_ns = seq_user_ns(m); struct group_info *group_info; int g; struct fdtable *fdt = NULL; -- cgit v1.2.1 From 4f326c0064b20b78b8041f4d2f6fe188a1129f18 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 27 Jul 2012 05:56:48 -0700 Subject: userns: Allow unprivilged mounts of proc and sysfs - The context in which proc and sysfs are mounted have no effect on the the uid/gid of their files so no conversion is needed except allowing the mount. Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- fs/proc/root.c | 1 + fs/sysfs/mount.c | 1 + 2 files changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/proc/root.c b/fs/proc/root.c index f2f251158d35..c6e9fac26bac 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -145,6 +145,7 @@ static struct file_system_type proc_fs_type = { .name = "proc", .mount = proc_mount, .kill_sb = proc_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; void __init proc_root_init(void) diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 71eb7e253927..db940a9be045 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -149,6 +149,7 @@ static struct file_system_type sysfs_fs_type = { .name = "sysfs", .mount = sysfs_mount, .kill_sb = sysfs_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; int __init sysfs_init(void) -- cgit v1.2.1 From 33d6dce607573b5fd7a43168e0d91221b3ca532b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 17 Jun 2011 13:33:20 -0700 Subject: proc: Generalize proc inode allocation Generalize the proc inode allocation so that it can be used without having to having to create a proc_dir_entry. This will allow namespace file descriptors to remain light weight entitities but still have the same inode number when the backing namespace is the same. Acked-by: Serge E. Hallyn Signed-off-by: Eric W. Biederman --- fs/proc/generic.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 0d80cef4cfb9..7b3ae3cc0ef9 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -350,14 +350,14 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */ * Return an inode number between PROC_DYNAMIC_FIRST and * 0xffffffff, or zero on failure. 
*/ -static unsigned int get_inode_number(void) +int proc_alloc_inum(unsigned int *inum) { unsigned int i; int error; retry: - if (ida_pre_get(&proc_inum_ida, GFP_KERNEL) == 0) - return 0; + if (!ida_pre_get(&proc_inum_ida, GFP_KERNEL)) + return -ENOMEM; spin_lock(&proc_inum_lock); error = ida_get_new(&proc_inum_ida, &i); @@ -365,18 +365,19 @@ retry: if (error == -EAGAIN) goto retry; else if (error) - return 0; + return error; if (i > UINT_MAX - PROC_DYNAMIC_FIRST) { spin_lock(&proc_inum_lock); ida_remove(&proc_inum_ida, i); spin_unlock(&proc_inum_lock); - return 0; + return -ENOSPC; } - return PROC_DYNAMIC_FIRST + i; + *inum = PROC_DYNAMIC_FIRST + i; + return 0; } -static void release_inode_number(unsigned int inum) +void proc_free_inum(unsigned int inum) { spin_lock(&proc_inum_lock); ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); @@ -554,13 +555,12 @@ static const struct inode_operations proc_dir_inode_operations = { static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) { - unsigned int i; struct proc_dir_entry *tmp; + int ret; - i = get_inode_number(); - if (i == 0) - return -EAGAIN; - dp->low_ino = i; + ret = proc_alloc_inum(&dp->low_ino); + if (ret) + return ret; if (S_ISDIR(dp->mode)) { if (dp->proc_iops == NULL) { @@ -764,7 +764,7 @@ EXPORT_SYMBOL(proc_create_data); static void free_proc_entry(struct proc_dir_entry *de) { - release_inode_number(de->low_ino); + proc_free_inum(de->low_ino); if (S_ISLNK(de->mode)) kfree(de->data); -- cgit v1.2.1 From bf056bfa80596a5d14b26b17276a56a0dcb080e5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 18 Jun 2011 17:48:18 -0700 Subject: proc: Fix the namespace inode permission checks. Change the proc namespace files into symlinks so that we won't cache the dentries for the namespace files which can bypass the ptrace_may_access checks. To support the symlinks, create an additional namespace inode with its own set of operations distinct from the proc pid inode and dentry methods as those no longer make sense. Signed-off-by: Eric W.
Biederman --- fs/proc/inode.c | 6 +- fs/proc/namespaces.c | 169 ++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 152 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 3b22bbdee9ec..439ae6886507 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -31,6 +31,7 @@ static void proc_evict_inode(struct inode *inode) struct proc_dir_entry *de; struct ctl_table_header *head; const struct proc_ns_operations *ns_ops; + void *ns; truncate_inode_pages(&inode->i_data, 0); clear_inode(inode); @@ -49,8 +50,9 @@ static void proc_evict_inode(struct inode *inode) } /* Release any associated namespace */ ns_ops = PROC_I(inode)->ns_ops; - if (ns_ops && ns_ops->put) - ns_ops->put(PROC_I(inode)->ns); + ns = PROC_I(inode)->ns; + if (ns_ops && ns) + ns_ops->put(ns); } static struct kmem_cache * proc_inode_cachep; diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 030250c27d70..7a6d8d69cdb8 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -38,6 +38,151 @@ static const struct file_operations ns_file_operations = { .llseek = no_llseek, }; +static const struct inode_operations ns_inode_operations = { + .setattr = proc_setattr, +}; + +static int ns_delete_dentry(const struct dentry *dentry) +{ + /* Don't cache namespace inodes when not in use */ + return 1; +} + +static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) +{ + struct inode *inode = dentry->d_inode; + const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; + + return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]", + ns_ops->name, inode->i_ino); +} + +const struct dentry_operations ns_dentry_operations = +{ + .d_delete = ns_delete_dentry, + .d_dname = ns_dname, +}; + +static struct dentry *proc_ns_get_dentry(struct super_block *sb, + struct task_struct *task, const struct proc_ns_operations *ns_ops) +{ + struct dentry *dentry, *result; + struct inode *inode; + struct proc_inode *ei; + struct qstr qname = { .name = "", }; + void *ns; + + ns = ns_ops->get(task); + if (!ns) + return ERR_PTR(-ENOENT); + + dentry = d_alloc_pseudo(sb, &qname); + if (!dentry) { + ns_ops->put(ns); + return ERR_PTR(-ENOMEM); + } + + inode = new_inode(sb); + if (!inode) { + dput(dentry); + ns_ops->put(ns); + return ERR_PTR(-ENOMEM); + } + + ei = PROC_I(inode); + inode->i_ino = get_next_ino(); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_op = &ns_inode_operations; + inode->i_mode = S_IFREG | S_IRUGO; + inode->i_fop = &ns_file_operations; + ei->ns_ops = ns_ops; + ei->ns = ns; + + d_set_d_op(dentry, &ns_dentry_operations); + result = d_instantiate_unique(dentry, inode); + if (result) { + dput(dentry); + dentry = result; + } + + return dentry; +} + +static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode = dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct proc_inode *ei = PROC_I(inode); + struct task_struct *task; + struct dentry *ns_dentry; + void *error = ERR_PTR(-EACCES); + + task = get_proc_task(inode); + if (!task) + goto out; + + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out_put_task; + + ns_dentry = proc_ns_get_dentry(sb, task, ei->ns_ops); + if (IS_ERR(ns_dentry)) { + error = ERR_CAST(ns_dentry); + goto out_put_task; + } + + dput(nd->path.dentry); + nd->path.dentry = ns_dentry; + error = NULL; + +out_put_task: + put_task_struct(task); +out: + return error; +} + +static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen) +{ + 
struct inode *inode = dentry->d_inode; + struct proc_inode *ei = PROC_I(inode); + const struct proc_ns_operations *ns_ops = ei->ns_ops; + struct task_struct *task; + void *ns; + char name[50]; + int len = -EACCES; + + task = get_proc_task(inode); + if (!task) + goto out; + + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out_put_task; + + len = -ENOENT; + ns = ns_ops->get(task); + if (!ns) + goto out_put_task; + + snprintf(name, sizeof(name), "%s", ns_ops->name); + len = strlen(name); + + if (len > buflen) + len = buflen; + if (copy_to_user(buffer, ns_ops->name, len)) + len = -EFAULT; + + ns_ops->put(ns); +out_put_task: + put_task_struct(task); +out: + return len; +} + +static const struct inode_operations proc_ns_link_inode_operations = { + .readlink = proc_ns_readlink, + .follow_link = proc_ns_follow_link, + .setattr = proc_setattr, +}; + static struct dentry *proc_ns_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { @@ -45,21 +190,15 @@ static struct dentry *proc_ns_instantiate(struct inode *dir, struct inode *inode; struct proc_inode *ei; struct dentry *error = ERR_PTR(-ENOENT); - void *ns; inode = proc_pid_make_inode(dir->i_sb, task); if (!inode) goto out; - ns = ns_ops->get(task); - if (!ns) - goto out_iput; - ei = PROC_I(inode); - inode->i_mode = S_IFREG|S_IRUSR; - inode->i_fop = &ns_file_operations; - ei->ns_ops = ns_ops; - ei->ns = ns; + inode->i_mode = S_IFLNK|S_IRWXUGO; + inode->i_op = &proc_ns_link_inode_operations; + ei->ns_ops = ns_ops; d_set_d_op(dentry, &pid_dentry_operations); d_add(dentry, inode); @@ -68,9 +207,6 @@ static struct dentry *proc_ns_instantiate(struct inode *dir, error = NULL; out: return error; -out_iput: - iput(inode); - goto out; } static int proc_ns_fill_cache(struct file *filp, void *dirent, @@ -97,10 +233,6 @@ static int proc_ns_dir_readdir(struct file *filp, void *dirent, if (!task) goto out_no_task; - ret = -EPERM; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto out; - ret = 0; i = filp->f_pos; switch (i) { @@ -160,10 +292,6 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir, if (!task) goto out_no_task; - error = ERR_PTR(-EPERM); - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto out; - last = &ns_entries[ARRAY_SIZE(ns_entries)]; for (entry = ns_entries; entry < last; entry++) { if (strlen((*entry)->name) != len) @@ -171,7 +299,6 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir, if (!memcmp(dentry->d_name.name, (*entry)->name, len)) break; } - error = ERR_PTR(-ENOENT); if (entry == last) goto out; -- cgit v1.2.1 From 98f842e675f96ffac96e6c50315790912b2812be Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 15 Jun 2011 10:21:48 -0700 Subject: proc: Usable inode numbers for the namespace file descriptors. Assign a unique proc inode to each namespace, and use that inode number to ensure we only allocate at most one proc inode for every namespace in proc. A single proc inode per namespace allows userspace to test to see if two processes are in the same namespace. This has been a long requested feature and only blocked because a naive implementation would put the id in a global space and would ultimately require having a namespace for the names of namespaces, making migration and certain virtualization tricks impossible. 
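[Editor's illustration, not part of the commit: with one stable inode per namespace, the userspace test reduces to comparing the st_ino of two tasks' /proc/PID/ns files. A minimal sketch, with error handling trimmed and all names illustrative:]

#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>

/* Return 1 if the two pids share the given namespace, 0 if not, -1 on error. */
static int same_ns(pid_t a, pid_t b, const char *ns_name)
{
	char path[64];
	struct stat st_a, st_b;

	snprintf(path, sizeof(path), "/proc/%d/ns/%s", (int)a, ns_name);
	if (stat(path, &st_a) < 0)	/* stat() follows the ns symlink */
		return -1;
	snprintf(path, sizeof(path), "/proc/%d/ns/%s", (int)b, ns_name);
	if (stat(path, &st_b) < 0)
		return -1;
	return st_a.st_ino == st_b.st_ino;	/* one inode per namespace */
}

[For example, same_ns(1, getpid(), "mnt") tests whether the caller still shares init's mount namespace.]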
We still don't have per-superblock inode numbers for proc, which appears necessary for application-unaware checkpoint/restart and migrations (if the application is using namespace file descriptors) but that is now allowed by the design if it becomes important. I have preallocated the ipc and uts initial proc inode numbers so their structures can be statically initialized. Signed-off-by: Eric W. Biederman --- fs/mount.h | 1 + fs/namespace.c | 14 ++++++++++++++ fs/proc/namespaces.c | 24 ++++++++++++++---------- 3 files changed, 29 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/mount.h b/fs/mount.h index 630fafc616bb..cd5007980400 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -4,6 +4,7 @@ struct mnt_namespace { atomic_t count; + unsigned int proc_inum; struct mount * root; struct list_head list; struct user_namespace *user_ns; diff --git a/fs/namespace.c b/fs/namespace.c index cab78a74aca3..c1bbe86f4920 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2301,6 +2301,7 @@ dput_out: static void free_mnt_ns(struct mnt_namespace *ns) { + proc_free_inum(ns->proc_inum); put_user_ns(ns->user_ns); kfree(ns); } @@ -2317,10 +2318,16 @@ static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) { struct mnt_namespace *new_ns; + int ret; new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); if (!new_ns) return ERR_PTR(-ENOMEM); + ret = proc_alloc_inum(&new_ns->proc_inum); + if (ret) { + kfree(new_ns); + return ERR_PTR(ret); + } new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); atomic_set(&new_ns->count, 1); new_ns->root = NULL; @@ -2799,10 +2806,17 @@ static int mntns_install(struct nsproxy *nsproxy, void *ns) return 0; } +static unsigned int mntns_inum(void *ns) +{ + struct mnt_namespace *mnt_ns = ns; + return mnt_ns->proc_inum; +} + const struct proc_ns_operations mntns_operations = { .name = "mnt", .type = CLONE_NEWNS, .get = mntns_get, .put = mntns_put, .install = mntns_install, + .inum = mntns_inum, }; diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 7a6d8d69cdb8..b7a47196c8c3 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -82,7 +82,7 @@ static struct dentry *proc_ns_get_dentry(struct super_block *sb, return ERR_PTR(-ENOMEM); } - inode = new_inode(sb); + inode = iget_locked(sb, ns_ops->inum(ns)); if (!inode) { dput(dentry); ns_ops->put(ns); @@ -90,13 +90,17 @@ static struct dentry *proc_ns_get_dentry(struct super_block *sb, } ei = PROC_I(inode); - inode->i_ino = get_next_ino(); - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - inode->i_op = &ns_inode_operations; - inode->i_mode = S_IFREG | S_IRUGO; - inode->i_fop = &ns_file_operations; - ei->ns_ops = ns_ops; - ei->ns = ns; + if (inode->i_state & I_NEW) { + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_op = &ns_inode_operations; + inode->i_mode = S_IFREG | S_IRUGO; + inode->i_fop = &ns_file_operations; + ei->ns_ops = ns_ops; + ei->ns = ns; + unlock_new_inode(inode); + } else { + ns_ops->put(ns); + } d_set_d_op(dentry, &ns_dentry_operations); result = d_instantiate_unique(dentry, inode); @@ -162,12 +166,12 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl if (!ns) goto out_put_task; - snprintf(name, sizeof(name), "%s", ns_ops->name); + snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns)); len = strlen(name); if (len > buflen) len = buflen; - if (copy_to_user(buffer, ns_ops->name, len)) + if (copy_to_user(buffer, name, len)) len =
-EFAULT; ns_ops->put(ns); -- cgit v1.2.1 From 6bdb5f213c4344324f600dde885f25768fbd14db Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Mon, 12 Nov 2012 16:55:38 -0500 Subject: NFS: Add sequence_privileged_ops for nfs4_proc_sequence() If I mount an NFS v4.1 server to a single client multiple times and then run xfstests over each mountpoint I usually get the client into a state where recovery deadlocks. The server informs the client of a cb_path_down sequence error, the client then does a bind_connection_to_session and checks the status of the lease. I found that bind_connection_to_session sets the NFS4_SESSION_DRAINING flag on the client, but this flag is never unset before nfs4_check_lease() reaches nfs4_proc_sequence(). This causes the client to deadlock, halting all NFS activity to the server. nfs4_proc_sequence() is only called by the state manager, so I can change it to run in privileged mode to bypass the NFS4_SESSION_DRAINING check and avoid the deadlock. Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/nfs/nfs4proc.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6300cdd81101..a32d953b08de 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6136,13 +6136,26 @@ static void nfs41_sequence_prepare(struct rpc_task *task, void *data) rpc_call_start(task); } +static void nfs41_sequence_prepare_privileged(struct rpc_task *task, void *data) +{ + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); + nfs41_sequence_prepare(task, data); +} + static const struct rpc_call_ops nfs41_sequence_ops = { .rpc_call_done = nfs41_sequence_call_done, .rpc_call_prepare = nfs41_sequence_prepare, .rpc_release = nfs41_sequence_release, }; -static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) +static const struct rpc_call_ops nfs41_sequence_privileged_ops = { + .rpc_call_done = nfs41_sequence_call_done, + .rpc_call_prepare = nfs41_sequence_prepare_privileged, + .rpc_release = nfs41_sequence_release, +}; + +static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred, + const struct rpc_call_ops *seq_ops) { struct nfs4_sequence_data *calldata; struct rpc_message msg = { @@ -6152,7 +6165,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_ struct rpc_task_setup task_setup_data = { .rpc_client = clp->cl_rpcclient, .rpc_message = &msg, - .callback_ops = &nfs41_sequence_ops, + .callback_ops = seq_ops, .flags = RPC_TASK_ASYNC | RPC_TASK_SOFT, }; @@ -6179,7 +6192,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0) return 0; - task = _nfs41_proc_sequence(clp, cred); + task = _nfs41_proc_sequence(clp, cred, &nfs41_sequence_ops); if (IS_ERR(task)) ret = PTR_ERR(task); else @@ -6193,7 +6206,7 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) struct rpc_task *task; int ret; - task = _nfs41_proc_sequence(clp, cred); + task = _nfs41_proc_sequence(clp, cred, &nfs41_sequence_privileged_ops); if (IS_ERR(task)) { ret = PTR_ERR(task); goto out; -- cgit v1.2.1 From 5df904aeb0d9baad90e78fc730dfe1afa4996005 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 21 Nov 2012 09:22:14 -0500 Subject: NFSv4.1: Handle session reset and bind_conn_to_session before lease check We can't send a SEQUENCE op unless the session is OK, so it is pointless to handle the
CHECK_LEASE state before we've dealt with SESSION_RESET and BIND_CONN_TO_SESSION. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index e0a28dffd29d..f3d1bc48c9c4 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2114,15 +2114,6 @@ static void nfs4_state_manager(struct nfs_client *clp) continue; } - if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { - section = "check lease"; - status = nfs4_check_lease(clp); - if (status < 0) - goto out_error; - if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) - continue; - } - /* Initialize or reset the session */ if (test_and_clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) { section = "reset session"; @@ -2143,6 +2134,14 @@ static void nfs4_state_manager(struct nfs_client *clp) continue; } + if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { + section = "check lease"; + status = nfs4_check_lease(clp); + if (status < 0) + goto out_error; + continue; + } + /* Recall session slots */ if (test_and_clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state)) { section = "recall slot"; -- cgit v1.2.1 From ae72ae676045274c82f3c25159a9dd7cfcf5ffae Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 20 Nov 2012 11:02:55 -0500 Subject: NFSv4.1: Don't confuse CREATE_SESSION arguments and results Don't store the target request and response sizes in the same variables used to store the server's replies to those targets. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a32d953b08de..3e572dc316e4 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5807,8 +5807,8 @@ void nfs4_destroy_session(struct nfs4_session *session) static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) { struct nfs4_session *session = args->client->cl_session; - unsigned int mxrqst_sz = session->fc_attrs.max_rqst_sz, - mxresp_sz = session->fc_attrs.max_resp_sz; + unsigned int mxrqst_sz = session->fc_target_max_rqst_sz, + mxresp_sz = session->fc_target_max_resp_sz; if (mxrqst_sz == 0) mxrqst_sz = NFS_MAX_FILE_IO_SIZE; @@ -6015,24 +6015,28 @@ int nfs4_init_session(struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; struct nfs4_session *session; - unsigned int rsize, wsize; + unsigned int target_max_rqst_sz = NFS_MAX_FILE_IO_SIZE; + unsigned int target_max_resp_sz = NFS_MAX_FILE_IO_SIZE; if (!nfs4_has_session(clp)) return 0; + if (server->rsize != 0) + target_max_resp_sz = server->rsize; + target_max_resp_sz += nfs41_maxread_overhead; + + if (server->wsize != 0) + target_max_rqst_sz = server->wsize; + target_max_rqst_sz += nfs41_maxwrite_overhead; + session = clp->cl_session; spin_lock(&clp->cl_lock); if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) { - - rsize = server->rsize; - if (rsize == 0) - rsize = NFS_MAX_FILE_IO_SIZE; - wsize = server->wsize; - if (wsize == 0) - wsize = NFS_MAX_FILE_IO_SIZE; - - session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead; - session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead; + /* Initialise targets and channel attributes */ + session->fc_target_max_rqst_sz = target_max_rqst_sz; + session->fc_attrs.max_rqst_sz = target_max_rqst_sz; + session->fc_target_max_resp_sz = target_max_resp_sz; + session->fc_attrs.max_resp_sz = 
target_max_resp_sz; } spin_unlock(&clp->cl_lock); -- cgit v1.2.1 From 688a9024e2bc8d07cdc62e287dfb048722cf96df Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 20 Nov 2012 10:53:39 -0500 Subject: NFSv4.1: Adjust CREATE_SESSION arguments when mounting a new filesystem If we're mounting a new filesystem, ensure that the session has negotiated large enough request and reply sizes to match the wsize and rsize mount arguments. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 3e572dc316e4..ee82cdddeebe 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6037,9 +6037,22 @@ int nfs4_init_session(struct nfs_server *server) session->fc_attrs.max_rqst_sz = target_max_rqst_sz; session->fc_target_max_resp_sz = target_max_resp_sz; session->fc_attrs.max_resp_sz = target_max_resp_sz; + } else { + /* Just adjust the targets */ + if (target_max_rqst_sz > session->fc_target_max_rqst_sz) { + session->fc_target_max_rqst_sz = target_max_rqst_sz; + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + } + if (target_max_resp_sz > session->fc_target_max_resp_sz) { + session->fc_target_max_resp_sz = target_max_resp_sz; + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + } } spin_unlock(&clp->cl_lock); + if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) + nfs4_schedule_lease_recovery(clp); + return nfs41_check_session_ready(clp); } -- cgit v1.2.1 From 43095d397219aa1898db23937b03c1215ef16a37 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 20 Nov 2012 11:13:12 -0500 Subject: NFSv4.1: We must bump the clientid sequence number after CREATE_SESSION We must always bump the clientid sequence number after a successful call to CREATE_SESSION on the server. The result of nfs4_verify_channel_attrs() is irrelevant to that requirement. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ee82cdddeebe..1ac339b4f092 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5917,10 +5917,9 @@ static int _nfs4_proc_create_session(struct nfs_client *clp, status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); - if (!status) + if (!status) { /* Verify the session's negotiated channel_attrs values */ status = nfs4_verify_channel_attrs(&args, session); - if (!status) { /* Increment the clientid slot sequence id */ clp->cl_seqid++; } -- cgit v1.2.1 From 2d473d378eb571ad77f9563653639aa35e22d39c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 19 Nov 2012 18:03:22 -0500 Subject: NFSv4.1: nfs4_alloc_slots doesn't need zeroing All that memory is going to be initialised to non-zero by nfs4_add_and_init_slots anyway. 
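[Editor's note, not part of the patch: kcalloc() and kmalloc_array() both guard the element-count multiplication against overflow; the only difference is the memset. Skipping the zeroing is safe exactly when, as with nfs4_add_and_init_slots(), every field is written before first use. A minimal sketch with an illustrative struct:]

#include <linux/slab.h>
#include <linux/types.h>

struct demo_slot {		/* illustrative, not from this patch */
	u32 slot_nr;
	u32 seq_nr;
};

static struct demo_slot *demo_alloc_slots(u32 n)
{
	struct demo_slot *slots;
	u32 i;

	/* overflow-checked like kcalloc(), but without the zeroing pass */
	slots = kmalloc_array(n, sizeof(*slots), GFP_KERNEL);
	if (!slots)
		return NULL;

	for (i = 0; i < n; i++) {	/* initialise every element */
		slots[i].slot_nr = i;
		slots[i].seq_nr = 1;
	}
	return slots;
}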
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1ac339b4f092..0402ebb9b490 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5658,7 +5658,7 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) static struct nfs4_slot *nfs4_alloc_slots(u32 max_slots, gfp_t gfp_flags) { - return kcalloc(max_slots, sizeof(struct nfs4_slot), gfp_flags); + return kmalloc_array(max_slots, sizeof(struct nfs4_slot), gfp_flags); } static void nfs4_add_and_init_slots(struct nfs4_slot_table *tbl, -- cgit v1.2.1 From 9216106a847a53e6d0fe6d11dfd9175f2ca7fccf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 19 Nov 2012 19:50:45 -0500 Subject: NFSv4.1: clean up nfs4_recall_slot to use nfs4_alloc_slots Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 2 ++ fs/nfs/nfs4proc.c | 2 +- fs/nfs/nfs4state.c | 3 +-- 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index a525fdefccde..36880b9aa91e 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -258,6 +258,8 @@ extern int nfs4_proc_get_lease_time(struct nfs_client *clp, extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync); +extern struct nfs4_slot *nfs4_alloc_slots(u32 max_slots, gfp_t gfp_flags); + static inline bool is_ds_only_client(struct nfs_client *clp) { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0402ebb9b490..5e5cc5a5065f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5656,7 +5656,7 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) return status; } -static struct nfs4_slot *nfs4_alloc_slots(u32 max_slots, gfp_t gfp_flags) +struct nfs4_slot *nfs4_alloc_slots(u32 max_slots, gfp_t gfp_flags) { return kmalloc_array(max_slots, sizeof(struct nfs4_slot), gfp_flags); } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index f3d1bc48c9c4..96fcbb97fd6a 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2033,8 +2033,7 @@ static int nfs4_recall_slot(struct nfs_client *clp) return 0; nfs4_begin_drain_session(clp); fc_tbl = &clp->cl_session->fc_slot_table; - new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot), - GFP_NOFS); + new = nfs4_alloc_slots(fc_tbl->target_max_slots, GFP_NOFS); if (!new) return -ENOMEM; -- cgit v1.2.1 From 933602e368c4452260c9bff4fbb3baba35cf987a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 16 Nov 2012 12:12:38 -0500 Subject: NFSv4.1: Shrink struct nfs4_sequence_res by moving sr_renewal_time Store the renewal time inside the session slot instead. 
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5e5cc5a5065f..14b39742b6e4 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -486,6 +486,7 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) { + struct nfs4_slot *slot; unsigned long timestamp; struct nfs_client *clp; @@ -502,12 +503,14 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res * if (!RPC_WAS_SENT(task)) goto out; + slot = res->sr_slot; + /* Check the SEQUENCE operation status */ switch (res->sr_status) { case 0: /* Update the slot's sequence and clientid lease timer */ - ++res->sr_slot->seq_nr; - timestamp = res->sr_renewal_time; + ++slot->seq_nr; + timestamp = slot->renewal_time; clp = res->sr_session->clp; do_renew_lease(clp, timestamp); /* Check sequence flags */ @@ -521,12 +524,12 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res * */ dprintk("%s: slot=%td seq=%d: Operation in progress\n", __func__, - res->sr_slot - res->sr_session->fc_slot_table.slots, - res->sr_slot->seq_nr); + slot - res->sr_session->fc_slot_table.slots, + slot->seq_nr); goto out_retry; default: /* Just update the slot sequence no. */ - ++res->sr_slot->seq_nr; + ++slot->seq_nr; } out: /* The session may be reset by one of the error handlers. */ @@ -637,6 +640,7 @@ int nfs41_setup_sequence(struct nfs4_session *session, rpc_task_set_priority(task, RPC_PRIORITY_NORMAL); slot = tbl->slots + slotid; + slot->renewal_time = jiffies; args->sa_session = session; args->sa_slotid = slotid; @@ -644,7 +648,6 @@ int nfs41_setup_sequence(struct nfs4_session *session, res->sr_session = session; res->sr_slot = slot; - res->sr_renewal_time = jiffies; res->sr_status_flags = 0; /* * sr_status is only set in decode_sequence, and so will remain -- cgit v1.2.1 From 1e2d9d44f3ceb7dac7cb14d2476d0a8128c8e169 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 21 Nov 2012 09:56:00 -0500 Subject: GFS2: Set gl_object during inode create This patch fixes a cluster coherency problem that occurs when one node creates a file, does several writes, then a different node tries to write to the same file. When the inode's glock is demoted, the inode wasn't synced to the media properly because the gl_object wasn't set. Later, the flush daemon noticed the uncommitted data and tried to flush it, only to discover the glock was no longer locked properly in exclusive mode. That caused an assert withdraw. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 2405695febe9..2b6f5698ef18 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -702,6 +702,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_free_inode; + ip->i_gl->gl_object = ip; error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); if (error) goto fail_free_inode; -- cgit v1.2.1 From fe20d7d5eefb218b82033ba5c13cbcbd2a3d874c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 21 Nov 2012 22:49:36 -0500 Subject: NFSv4: Fix a compile time warning when #undef CONFIG_NFS_V4_1 The function nfs4_get_machine_cred_locked is used by NFSv4.0 routines too. 
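[Editor's sketch of the failure mode, with hypothetical names; not part of the patch. A function declared only under a config option is invisible to builds without that option, so callers compiled there see an implicit declaration; the hunk below fixes the real case by hoisting the nfs4_get_machine_cred_locked() declaration out of the #ifdef.]

/* header, before the fix */
#if defined(CONFIG_SOME_OPTION)
int shared_helper(void);	/* visible only when the option is set */
#endif

/* a caller compiled with CONFIG_SOME_OPTION undefined */
int use_helper(void)
{
	return shared_helper();	/* warning: implicit declaration */
}

/* after the fix: declare unconditionally, guard only optional bits */
int shared_helper(void);
#if defined(CONFIG_SOME_OPTION)
int optional_helper(void);
#endif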
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index a525fdefccde..ea4e36241044 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -321,13 +321,13 @@ extern void nfs4_renew_state(struct work_struct *); /* nfs4state.c */ struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp); +struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp); int nfs4_discover_server_trunking(struct nfs_client *clp, struct nfs_client **); int nfs40_discover_server_trunking(struct nfs_client *clp, struct nfs_client **, struct rpc_cred *); #if defined(CONFIG_NFS_V4_1) -struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp); int nfs41_discover_server_trunking(struct nfs_client *clp, struct nfs_client **, struct rpc_cred *); -- cgit v1.2.1 From d751f748b359534d78e2b2e52b59d39f0e0540aa Mon Sep 17 00:00:00 2001 From: Jim Rees Date: Fri, 16 Nov 2012 18:12:06 -0500 Subject: NFS: Reduce stack use in encode_exchange_id() encode_exchange_id() uses more stack space than necessary, giving a compile time warning. Reduce the size of the static buffer for implementation name. Signed-off-by: Jim Rees Reviewed-by: "Adamson, Dros" Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 40836ee5dc3a..142aacb92459 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -270,6 +270,8 @@ static int nfs4_stat_to_errno(int); #if defined(CONFIG_NFS_V4_1) #define NFS4_MAX_MACHINE_NAME_LEN (64) +#define IMPL_NAME_LIMIT (sizeof(utsname()->sysname) + sizeof(utsname()->release) + \ + sizeof(utsname()->version) + sizeof(utsname()->machine) + 8) #define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \ encode_verifier_maxsz + \ @@ -282,7 +284,7 @@ static int nfs4_stat_to_errno(int); 1 /* nii_domain */ + \ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ 1 /* nii_name */ + \ - XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ + XDR_QUADLEN(IMPL_NAME_LIMIT) + \ 3 /* nii_date */) #define decode_exchange_id_maxsz (op_decode_hdr_maxsz + \ 2 /* eir_clientid */ + \ @@ -1713,7 +1715,7 @@ static void encode_exchange_id(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - char impl_name[NFS4_OPAQUE_LIMIT]; + char impl_name[IMPL_NAME_LIMIT]; int len = 0; encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr); @@ -1728,7 +1730,7 @@ static void encode_exchange_id(struct xdr_stream *xdr, if (send_implementation_id && sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) > 1 && sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) - <= NFS4_OPAQUE_LIMIT + 1) + <= sizeof(impl_name) + 1) len = snprintf(impl_name, sizeof(impl_name), "%s %s %s %s", utsname()->sysname, utsname()->release, utsname()->version, utsname()->machine); -- cgit v1.2.1 From 25389bb207987b5774182f763b9fb65ff08761c8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 23 Nov 2012 14:03:04 +0100 Subject: jbd: Fix lock ordering bug in journal_unmap_buffer() Commit 09e05d48 introduced a wait for transaction commit into journal_unmap_buffer() in the case we are truncating a buffer undergoing commit in the page straddling i_size on a filesystem with blocksize < pagesize.
Sadly we forgot to drop the buffer lock before waiting for transaction commit and thus deadlock is possible when kjournald wants to lock the buffer. Fix the problem by dropping the buffer lock before waiting for transaction commit. Since we are still holding the page lock (and that is OK), the buffer cannot disappear under us. CC: stable@vger.kernel.org # Wherever commit 09e05d48 was taken Signed-off-by: Jan Kara --- fs/jbd/transaction.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 78b7f84241d4..7f5120bf0ec2 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1961,7 +1961,9 @@ retry: spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); spin_unlock(&journal->j_state_lock); + unlock_buffer(bh); log_wait_commit(journal, tid); + lock_buffer(bh); goto retry; } /* -- cgit v1.2.1 From 4c1002100898d03c5c9142ffaf58351c841ab94a Mon Sep 17 00:00:00 2001 From: Yanchuan Nian Date: Mon, 12 Nov 2012 09:27:37 +0800 Subject: nfs: Fix wrong slab cache in nfs_commit_mempool The slab cache in nfs_commit_mempool is wrong, and I think it is just a slip. I tested it on an x86-32 machine, the size of nfs_write_header is 544, and the size of nfs_commit_data is 408, so it works fine. It is also true that sizeof(struct nfs_write_header) > sizeof(struct nfs_commit_data) on other platforms in my opinion. Just fix it. Signed-off-by: Yanchuan Nian Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 9347ab7c9574..f710e39f6ba2 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1829,7 +1829,7 @@ int __init nfs_init_writepagecache(void) goto out_destroy_write_mempool; nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT, - nfs_wdata_cachep); + nfs_cdata_cachep); if (nfs_commit_mempool == NULL) goto out_destroy_commit_cache; -- cgit v1.2.1 From 57d276d71aef7d8305ff002a070cb98deb2edced Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 16 Nov 2012 15:22:43 -0500 Subject: nfsd: fix v4 reply caching Very embarrassing: 1091006c5eb15cba56785bd5b498a8d0b9546903 "nfsd: turn on reply cache for NFSv4" missed a line, effectively leaving the reply cache off in the v4 case. I thought I'd tested that, but I guess not. This time, wrote a pynfs test to confirm it works. Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/nfssvc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 2013aa001dab..30d3784d0280 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -640,7 +640,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) } /* Store reply in cache. */ - nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1); + nfsd_cache_update(rqstp, rqstp->rq_cachetype, statp + 1); return 1; } -- cgit v1.2.1 From 447bfcc936ce28636833e89c4b82f424a291dde9 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 16 Nov 2012 21:53:58 -0500 Subject: nfsd4: no, we're not going to check tags for utf8 Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 406d0c4620f6..9dfad585d413 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1595,12 +1595,6 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) bool cachethis = false; int i; - /* - * XXX: According to spec, we should check the tag - * for UTF-8 compliance.
I'm postponing this for - * now because it seems that some clients do use - * binary tags. - */ READ_BUF(4); READ32(argp->taglen); READ_BUF(argp->taglen + 8); -- cgit v1.2.1 From 8a61b18c9b13987310d0f3ba13aa04af51f02a1c Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 16 Nov 2012 22:28:38 -0500 Subject: nfsd4: simplify reading of opnum The comment here is totally bogus: - OP_WRITE + 1 is RELEASE_LOCKOWNER. Maybe there was some older version of the spec in which that served as a sort of OP_ILLEGAL? No idea, but it's clearly wrong now. - In any case, I can't see that the spec says anything about what to do if the client sends us less ops than promised. It's clearly nutty client behavior, and we should do whatever's easiest: returning an xdr error (even though it won't be consistent with the error on the last op returned) seems fine to me. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 34 ++-------------------------------- 1 file changed, 2 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 9dfad585d413..cfebc9c4f4c9 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1624,38 +1624,8 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) op = &argp->ops[i]; op->replay = NULL; - /* - * We can't use READ_BUF() here because we need to handle - * a missing opcode as an OP_WRITE + 1. So we need to check - * to see if we're truly at the end of our buffer or if there - * is another page we need to flip to. - */ - - if (argp->p == argp->end) { - if (argp->pagelen < 4) { - /* There isn't an opcode still on the wire */ - op->opnum = OP_WRITE + 1; - op->status = nfserr_bad_xdr; - argp->opcnt = i+1; - break; - } - - /* - * False alarm. We just hit a page boundary, but there - * is still data available. Move pointer across page - * boundary. *snip from READ_BUF* - */ - argp->p = page_address(argp->pagelist[0]); - argp->pagelist++; - if (argp->pagelen < PAGE_SIZE) { - argp->end = argp->p + (argp->pagelen>>2); - argp->pagelen = 0; - } else { - argp->end = argp->p + (PAGE_SIZE>>2); - argp->pagelen -= PAGE_SIZE; - } - } - op->opnum = ntohl(*argp->p++); + READ_BUF(4); + READ32(op->opnum); if (op->opnum >= FIRST_NFS4_OP && op->opnum <= LAST_NFS4_OP) op->status = ops->decoders[op->opnum](argp, &op->u); -- cgit v1.2.1 From 5a80a54d21c96590d013378d8c5f65f879451ab4 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 16 Nov 2012 10:01:30 -0500 Subject: nfsd4: reorganize write decoding In preparation for moving some of it elsewhere. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4xdr.c | 62 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index cfebc9c4f4c9..579dc707bad9 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1139,12 +1139,30 @@ nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify DECODE_TAIL; } +static int fill_in_write_vector(struct kvec *vec, struct kvec *head, struct page **pagelist, int buflen) +{ + int i = 1; + + vec[0].iov_base = head->iov_base; + vec[0].iov_len = min_t(int, buflen, head->iov_len); + buflen -= vec[0].iov_len; + + while (buflen) { + vec[i].iov_base = page_address(pagelist[i - 1]); + vec[i].iov_len = min_t(int, PAGE_SIZE, buflen); + buflen -= vec[i].iov_len; + i++; + } + return i; +} + static __be32 nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) { int avail; - int v; int len; + struct page **pagelist; + struct kvec head; DECODE_HEAD; status = nfsd4_decode_stateid(argp, &write->wr_stateid); @@ -1167,27 +1185,29 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) __FILE__, __LINE__); goto xdr_error; } - argp->rqstp->rq_vec[0].iov_base = p; - argp->rqstp->rq_vec[0].iov_len = avail; - v = 0; - len = write->wr_buflen; - while (len > argp->rqstp->rq_vec[v].iov_len) { - len -= argp->rqstp->rq_vec[v].iov_len; - v++; - argp->rqstp->rq_vec[v].iov_base = page_address(argp->pagelist[0]); - argp->pagelist++; - if (argp->pagelen >= PAGE_SIZE) { - argp->rqstp->rq_vec[v].iov_len = PAGE_SIZE; - argp->pagelen -= PAGE_SIZE; - } else { - argp->rqstp->rq_vec[v].iov_len = argp->pagelen; - argp->pagelen -= len; - } + head.iov_base = p; + head.iov_len = avail; + WARN_ON(avail != (XDR_QUADLEN(avail) << 2)); + pagelist = argp->pagelist; + + len = XDR_QUADLEN(write->wr_buflen) << 2; + if (len >= avail) { + int pages; + + len -= avail; + + pages = len >> PAGE_SHIFT; + argp->pagelist += pages; + argp->pagelen -= pages * PAGE_SIZE; + len -= pages * PAGE_SIZE; + + argp->p = (__be32 *)page_address(argp->pagelist[0]); + argp->end = argp->p + XDR_QUADLEN(PAGE_SIZE); } - argp->end = (__be32*) (argp->rqstp->rq_vec[v].iov_base + argp->rqstp->rq_vec[v].iov_len); - argp->p = (__be32*) (argp->rqstp->rq_vec[v].iov_base + (XDR_QUADLEN(len) << 2)); - argp->rqstp->rq_vec[v].iov_len = len; - write->wr_vlen = v+1; + argp->p += XDR_QUADLEN(len); + write->wr_vlen = fill_in_write_vector(argp->rqstp->rq_vec, + &head, pagelist, write->wr_buflen); + WARN_ON_ONCE(write->wr_vlen > ARRAY_SIZE(argp->rqstp->rq_vec)); DECODE_TAIL; } -- cgit v1.2.1 From 70cc7f75b1ee4161dfdea1012223db25712ab1a5 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 16 Nov 2012 14:16:46 -0500 Subject: nfsd4: move more write parameters into xdr argument In preparation for moving some of this elsewhere. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4xdr.c | 20 +++++++++----------- fs/nfsd/xdr4.h | 2 ++ 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 579dc707bad9..cb9f9017af8f 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1139,16 +1139,17 @@ nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify DECODE_TAIL; } -static int fill_in_write_vector(struct kvec *vec, struct kvec *head, struct page **pagelist, int buflen) +static int fill_in_write_vector(struct kvec *vec, struct nfsd4_write *write) { int i = 1; + int buflen = write->wr_buflen; - vec[0].iov_base = head->iov_base; - vec[0].iov_len = min_t(int, buflen, head->iov_len); + vec[0].iov_base = write->wr_head.iov_base; + vec[0].iov_len = min_t(int, buflen, write->wr_head.iov_len); buflen -= vec[0].iov_len; while (buflen) { - vec[i].iov_base = page_address(pagelist[i - 1]); + vec[i].iov_base = page_address(write->wr_pagelist[i - 1]); vec[i].iov_len = min_t(int, PAGE_SIZE, buflen); buflen -= vec[i].iov_len; i++; @@ -1161,8 +1162,6 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) { int avail; int len; - struct page **pagelist; - struct kvec head; DECODE_HEAD; status = nfsd4_decode_stateid(argp, &write->wr_stateid); @@ -1185,10 +1184,10 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) __FILE__, __LINE__); goto xdr_error; } - head.iov_base = p; - head.iov_len = avail; + write->wr_head.iov_base = p; + write->wr_head.iov_len = avail; WARN_ON(avail != (XDR_QUADLEN(avail) << 2)); - pagelist = argp->pagelist; + write->wr_pagelist = argp->pagelist; len = XDR_QUADLEN(write->wr_buflen) << 2; if (len >= avail) { @@ -1205,8 +1204,7 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) argp->end = argp->p + XDR_QUADLEN(PAGE_SIZE); } argp->p += XDR_QUADLEN(len); - write->wr_vlen = fill_in_write_vector(argp->rqstp->rq_vec, - &head, pagelist, write->wr_buflen); + write->wr_vlen = fill_in_write_vector(argp->rqstp->rq_vec, write); WARN_ON_ONCE(write->wr_vlen > ARRAY_SIZE(argp->rqstp->rq_vec)); DECODE_TAIL; diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 3c414c1be295..152867b8125d 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -386,6 +386,8 @@ struct nfsd4_write { u32 wr_stable_how; /* request */ u32 wr_buflen; /* request */ int wr_vlen; + struct kvec wr_head; + struct page ** wr_pagelist; /* request */ u32 wr_bytes_written; /* response */ u32 wr_how_written; /* response */ -- cgit v1.2.1 From ffe1137ba743cdf1c2414d5a89690aec1daa6bba Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 15 Nov 2012 14:52:19 -0500 Subject: nfsd4: delay filling in write iovec array till after xdr decoding Our server rejects compounds containing more than one write operation. It's unclear whether this is really permitted by the spec; with 4.0, it's possibly OK, with 4.1 (which has clearer limits on compound parameters), it's probably not OK. No client that we're aware of has ever done this, but in theory it could be useful. The source of the limitation: we need an array of iovecs to pass to the write operation. In the worst case that array of iovecs could have hundreds of elements (the maximum rwsize divided by the page size), so it's too big to put on the stack, or in each compound op. So we instead keep a single such array in the compound argument. We fill in that array at the time we decode the xdr operation. 
But we decode every op in the compound before executing any of them. So once we've used that array we can't decode another write. If we instead delay filling in that array till the time we actually perform the write, we can reuse it. Another option might be to switch to decoding compound ops one at a time. I considered doing that, but it has a number of other side effects, and I'd rather fix just this one problem for now. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 24 +++++++++++++++++++++++- fs/nfsd/nfs4xdr.c | 20 -------------------- fs/nfsd/xdr4.h | 1 - 3 files changed, 23 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 1d2396b79574..87d24e5f3ca4 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -881,6 +881,24 @@ out: return status; } +static int fill_in_write_vector(struct kvec *vec, struct nfsd4_write *write) +{ + int i = 1; + int buflen = write->wr_buflen; + + vec[0].iov_base = write->wr_head.iov_base; + vec[0].iov_len = min_t(int, buflen, write->wr_head.iov_len); + buflen -= vec[0].iov_len; + + while (buflen) { + vec[i].iov_base = page_address(write->wr_pagelist[i - 1]); + vec[i].iov_len = min_t(int, PAGE_SIZE, buflen); + buflen -= vec[i].iov_len; + i++; + } + return i; +} + static __be32 nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_write *write) @@ -889,6 +907,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct file *filp = NULL; __be32 status = nfs_ok; unsigned long cnt; + int nvecs; /* no need to check permission - this will be done in nfsd_write() */ @@ -911,8 +930,11 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, write->wr_how_written = write->wr_stable_how; gen_boot_verifier(&write->wr_verifier); + nvecs = fill_in_write_vector(rqstp->rq_vec, write); + WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec)); + status = nfsd_write(rqstp, &cstate->current_fh, filp, - write->wr_offset, rqstp->rq_vec, write->wr_vlen, + write->wr_offset, rqstp->rq_vec, nvecs, &cnt, &write->wr_how_written); if (filp) fput(filp); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index cb9f9017af8f..09204f590355 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1139,24 +1139,6 @@ nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify DECODE_TAIL; } -static int fill_in_write_vector(struct kvec *vec, struct nfsd4_write *write) -{ - int i = 1; - int buflen = write->wr_buflen; - - vec[0].iov_base = write->wr_head.iov_base; - vec[0].iov_len = min_t(int, buflen, write->wr_head.iov_len); - buflen -= vec[0].iov_len; - - while (buflen) { - vec[i].iov_base = page_address(write->wr_pagelist[i - 1]); - vec[i].iov_len = min_t(int, PAGE_SIZE, buflen); - buflen -= vec[i].iov_len; - i++; - } - return i; -} - static __be32 nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) { @@ -1204,8 +1186,6 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) argp->end = argp->p + XDR_QUADLEN(PAGE_SIZE); } argp->p += XDR_QUADLEN(len); - write->wr_vlen = fill_in_write_vector(argp->rqstp->rq_vec, write); - WARN_ON_ONCE(write->wr_vlen > ARRAY_SIZE(argp->rqstp->rq_vec)); DECODE_TAIL; } diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 152867b8125d..331f8a3277ab 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -385,7 +385,6 @@ struct nfsd4_write { u64 wr_offset; /* request */ u32 wr_stable_how; /* request */ u32 wr_buflen; /* request */ - int wr_vlen; struct kvec 
wr_head; struct page ** wr_pagelist; /* request */ -- cgit v1.2.1 From 063b0fb9fadadc0caaea6c8f31e3f6bc978a4904 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sun, 25 Nov 2012 14:48:10 -0500 Subject: nfsd4: downgrade some fs/nfsd/nfs4state.c BUG's Linus has pointed out that indiscriminate use of BUG's can make it harder to diagnose bugs because they can bring a machine down, often before we manage to get any useful debugging information to the logs. (Consider, for example, a BUG() that fires in a workqueue, or while holding a spinlock). Most of these BUG's won't do much more than kill an nfsd thread, but it would still probably be safer to get out the warning without dying. There's still more of this to do in nfsd/. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index e75872f81e1c..41d2aed8ed06 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -190,7 +190,7 @@ static struct list_head file_hashtbl[FILE_HASH_SIZE]; static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag) { - BUG_ON(!(fp->fi_fds[oflag] || fp->fi_fds[O_RDWR])); + WARN_ON_ONCE(!(fp->fi_fds[oflag] || fp->fi_fds[O_RDWR])); atomic_inc(&fp->fi_access[oflag]); } @@ -249,7 +249,7 @@ static inline int get_new_stid(struct nfs4_stid *stid) * preallocations that can exist at a time, but the state lock * prevents anyone from using ours before we get here: */ - BUG_ON(error); + WARN_ON_ONCE(error); /* * It shouldn't be a problem to reuse an opaque stateid value. * I don't think it is for 4.1. But with 4.0 I worry that, for @@ -494,7 +494,8 @@ static int nfs4_access_to_omode(u32 access) case NFS4_SHARE_ACCESS_BOTH: return O_RDWR; } - BUG(); + WARN_ON_ONCE(1); + return O_RDONLY; } /* release all access and file references for a given stateid */ @@ -1605,10 +1606,9 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, switch (exid->spa_how) { case SP4_NONE: break; + default: /* checked by xdr code */ + WARN_ON_ONCE(1); case SP4_SSV: - return nfserr_serverfault; - default: - BUG(); /* checked by xdr code */ case SP4_MACH_CRED: return nfserr_serverfault; /* no excuse :-/ */ } @@ -2912,7 +2912,7 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status) open->op_why_no_deleg = WND4_CANCELLED; break; case NFS4_SHARE_WANT_NO_DELEG: - BUG(); /* not supposed to get here */ + WARN_ON_ONCE(1); } } } @@ -3466,7 +3466,11 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, goto out; if (filpp) { *filpp = dp->dl_file->fi_deleg_file; - BUG_ON(!*filpp); + if (!*filpp) { + WARN_ON_ONCE(1); + status = nfserr_serverfault; + goto out; + } } break; case NFS4_OPEN_STID: @@ -3693,7 +3697,7 @@ static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_ac case NFS4_SHARE_ACCESS_BOTH: break; default: - BUG(); + WARN_ON_ONCE(1); } } @@ -3882,7 +3886,7 @@ last_byte_offset(u64 start, u64 len) { u64 end; - BUG_ON(!len); + WARN_ON_ONCE(!len); end = start + len; return end > start ? end - 1: NFS4_MAX_UINT64; } @@ -4552,7 +4556,7 @@ nfs4_release_reclaim(struct nfsd_net *nn) nfs4_remove_reclaim_record(crp, nn); } } - BUG_ON(nn->reclaim_str_hashtbl_size); + WARN_ON_ONCE(nn->reclaim_str_hashtbl_size); } /* -- cgit v1.2.1 From 7c4cebe8e02dd0b0e655605442bbe9268db9ed4f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 23 Nov 2012 14:24:23 +1100 Subject: xfs: inode allocation should use unmapped buffers. 
Inode buffers do not need to be mapped as inodes are read or written directly from/to the pages underlying the buffer. This fixes a regression introduced by commit 611c994 ("xfs: make XBF_MAPPED the default behaviour"). Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_ialloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 2d6495eaaa34..a815412eab80 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -200,7 +200,8 @@ xfs_ialloc_inode_init( */ d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - mp->m_bsize * blks_per_cluster, 0); + mp->m_bsize * blks_per_cluster, + XBF_UNMAPPED); if (!fbuf) return ENOMEM; /* -- cgit v1.2.1 From e3725ec015dfbbeb896295cf2b3a995f28b0630e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 16 Nov 2012 12:25:01 -0500 Subject: NFSv4.1: Shrink struct nfs4_sequence_res by moving the session pointer Move the session pointer into the slot table, then have struct nfs4_slot point to that slot table. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 3 ++- fs/nfs/nfs4proc.c | 33 +++++++++++++++++++++++---------- fs/nfs/nfs4state.c | 2 +- fs/nfs/nfs4xdr.c | 8 +++++--- 4 files changed, 31 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 36880b9aa91e..42c58691fb41 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -258,7 +258,8 @@ extern int nfs4_proc_get_lease_time(struct nfs_client *clp, extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync); -extern struct nfs4_slot *nfs4_alloc_slots(u32 max_slots, gfp_t gfp_flags); +extern struct nfs4_slot *nfs4_alloc_slots(struct nfs4_slot_table *table, + u32 max_slots, gfp_t gfp_flags); static inline bool is_ds_only_client(struct nfs_client *clp) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 14b39742b6e4..5b61c4a83191 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -467,25 +467,28 @@ void nfs4_check_drain_bc_complete(struct nfs4_session *ses) static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) { + struct nfs4_session *session; struct nfs4_slot_table *tbl; - tbl = &res->sr_session->fc_slot_table; if (!res->sr_slot) { /* just wake up the next guy waiting since * we may have not consumed a slot after all */ dprintk("%s: No slot\n", __func__); return; } + tbl = res->sr_slot->table; + session = tbl->session; spin_lock(&tbl->slot_tbl_lock); nfs4_free_slot(tbl, res->sr_slot - tbl->slots); - nfs4_check_drain_fc_complete(res->sr_session); + nfs4_check_drain_fc_complete(session); spin_unlock(&tbl->slot_tbl_lock); res->sr_slot = NULL; } static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) { + struct nfs4_session *session; struct nfs4_slot *slot; unsigned long timestamp; struct nfs_client *clp; @@ -504,6 +507,7 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res * goto out; slot = res->sr_slot; + session = slot->table->session; /* Check the SEQUENCE operation status */ switch (res->sr_status) { @@ -511,7 +515,7 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res * /* Update the slot's sequence and clientid lease timer */ ++slot->seq_nr; timestamp = slot->renewal_time; - clp = res->sr_session->clp; + clp = session->clp; do_renew_lease(clp, timestamp); /* Check 
sequence flags */ if (res->sr_status_flags != 0) @@ -524,7 +528,7 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res * */ dprintk("%s: slot=%td seq=%d: Operation in progress\n", __func__, - slot - res->sr_session->fc_slot_table.slots, + slot - session->fc_slot_table.slots, slot->seq_nr); goto out_retry; default: @@ -546,7 +550,7 @@ out_retry: static int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) { - if (res->sr_session == NULL) + if (res->sr_slot == NULL) return 1; return nfs41_sequence_done(task, res); } @@ -591,7 +595,6 @@ static void nfs41_init_sequence(struct nfs4_sequence_args *args, args->sa_cache_this = 0; if (cache_reply) args->sa_cache_this = 1; - res->sr_session = NULL; res->sr_slot = NULL; } @@ -646,7 +649,6 @@ int nfs41_setup_sequence(struct nfs4_session *session, dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr); - res->sr_session = session; res->sr_slot = slot; res->sr_status_flags = 0; /* @@ -5659,9 +5661,18 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) return status; } -struct nfs4_slot *nfs4_alloc_slots(u32 max_slots, gfp_t gfp_flags) +struct nfs4_slot *nfs4_alloc_slots(struct nfs4_slot_table *table, + u32 max_slots, gfp_t gfp_flags) { - return kmalloc_array(max_slots, sizeof(struct nfs4_slot), gfp_flags); + struct nfs4_slot *tbl; + u32 i; + + tbl = kmalloc_array(max_slots, sizeof(*tbl), gfp_flags); + if (tbl != NULL) { + for (i = 0; i < max_slots; i++) + tbl[i].table = table; + } + return tbl; } static void nfs4_add_and_init_slots(struct nfs4_slot_table *tbl, @@ -5699,7 +5710,7 @@ static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs, /* Does the newly negotiated max_reqs match the existing slot table? */ if (max_reqs != tbl->max_slots) { - new = nfs4_alloc_slots(max_reqs, GFP_NOFS); + new = nfs4_alloc_slots(tbl, max_reqs, GFP_NOFS); if (!new) goto out; } @@ -5738,11 +5749,13 @@ static int nfs4_setup_session_slot_tables(struct nfs4_session *ses) dprintk("--> %s\n", __func__); /* Fore channel */ tbl = &ses->fc_slot_table; + tbl->session = ses; status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1); if (status) /* -ENOMEM */ return status; /* Back channel */ tbl = &ses->bc_slot_table; + tbl->session = ses; status = nfs4_realloc_slot_table(tbl, ses->bc_attrs.max_reqs, 0); if (status && tbl->slots == NULL) /* Fore and back channel share a connection so get diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 96fcbb97fd6a..9495789c425b 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2033,7 +2033,7 @@ static int nfs4_recall_slot(struct nfs_client *clp) return 0; nfs4_begin_drain_session(clp); fc_tbl = &clp->cl_session->fc_slot_table; - new = nfs4_alloc_slots(fc_tbl->target_max_slots, GFP_NOFS); + new = nfs4_alloc_slots(fc_tbl, fc_tbl->target_max_slots, GFP_NOFS); if (!new) return -ENOMEM; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 672d9b0ef2c5..4126f054610a 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -5507,12 +5507,13 @@ static int decode_sequence(struct xdr_stream *xdr, struct rpc_rqst *rqstp) { #if defined(CONFIG_NFS_V4_1) + struct nfs4_session *session; struct nfs4_sessionid id; u32 dummy; int status; __be32 *p; - if (!res->sr_session) + if (res->sr_slot == NULL) return 0; status = decode_op_hdr(xdr, OP_SEQUENCE); @@ -5526,8 +5527,9 @@ static int decode_sequence(struct xdr_stream *xdr, * sequence number, the server is looney tunes. 
*/ status = -EREMOTEIO; + session = res->sr_slot->table->session; - if (memcmp(id.data, res->sr_session->sess_id.data, + if (memcmp(id.data, session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) { dprintk("%s Invalid session id\n", __func__); goto out_err; @@ -5545,7 +5547,7 @@ static int decode_sequence(struct xdr_stream *xdr, } /* slot id */ dummy = be32_to_cpup(p++); - if (dummy != res->sr_slot - res->sr_session->fc_slot_table.slots) { + if (dummy != res->sr_slot - session->fc_slot_table.slots) { dprintk("%s Invalid slot id\n", __func__); goto out_err; } -- cgit v1.2.1 From df2fabffbace8988f3265585ec793ff9deccdea7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 16 Nov 2012 12:45:06 -0500 Subject: NFSv4.1: Label each entry in the session slot tables with its slot number Instead of doing slot table pointer gymnastics every time we want to know which slot we're using. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 12 +++++++----- fs/nfs/nfs4xdr.c | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5b61c4a83191..4311dba49c58 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -526,9 +526,9 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res * * returned NFS4ERR_DELAY as per Section 2.10.6.2 * of RFC5661. */ - dprintk("%s: slot=%td seq=%d: Operation in progress\n", + dprintk("%s: slot=%u seq=%u: Operation in progress\n", __func__, - slot - session->fc_slot_table.slots, + slot->slot_nr, slot->seq_nr); goto out_retry; default: @@ -671,9 +671,9 @@ int nfs4_setup_sequence(const struct nfs_server *server, if (session == NULL) goto out; - dprintk("--> %s clp %p session %p sr_slot %td\n", + dprintk("--> %s clp %p session %p sr_slot %d\n", __func__, session->clp, session, res->sr_slot ? - res->sr_slot - session->fc_slot_table.slots : -1); + res->sr_slot->slot_nr : -1); ret = nfs41_setup_sequence(session, args, res, task); out: @@ -5669,8 +5669,10 @@ struct nfs4_slot *nfs4_alloc_slots(struct nfs4_slot_table *table, tbl = kmalloc_array(max_slots, sizeof(*tbl), gfp_flags); if (tbl != NULL) { - for (i = 0; i < max_slots; i++) + for (i = 0; i < max_slots; i++) { tbl[i].table = table; + tbl[i].slot_nr = i; + } } return tbl; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 4126f054610a..50bac7066160 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -5547,7 +5547,7 @@ static int decode_sequence(struct xdr_stream *xdr, } /* slot id */ dummy = be32_to_cpup(p++); - if (dummy != res->sr_slot - session->fc_slot_table.slots) { + if (dummy != res->sr_slot->slot_nr) { dprintk("%s Invalid slot id\n", __func__); goto out_err; } -- cgit v1.2.1 From 2b2fa71723f955d5b4a0f4edd99cf3cd69ceafd1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 16 Nov 2012 12:58:36 -0500 Subject: NFSv4.1: Simplify struct nfs4_sequence_args too Replace the session pointer + slotid with a pointer to the allocated slot. 
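As a rough before/after sketch of the struct change (field names are taken from the surrounding diffs; the complete layout of struct nfs4_sequence_args is an assumption):

	/* Before: the args carried the session and a slot index separately. */
	struct nfs4_sequence_args {
		struct nfs4_session	*sa_session;
		u32			sa_slotid;
		u8			sa_cache_this;
	};

	/* After: one pointer suffices, since slot->table->session and
	 * slot->slot_nr recover the old information when needed. */
	struct nfs4_sequence_args {
		struct nfs4_slot	*sa_slot;
		u8			sa_cache_this;
	};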
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 6 +++--- fs/nfs/nfs4xdr.c | 21 ++++++++++----------- 2 files changed, 13 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4311dba49c58..6c41a34e34b4 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -591,7 +591,7 @@ out: static void nfs41_init_sequence(struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply) { - args->sa_session = NULL; + args->sa_slot = NULL; args->sa_cache_this = 0; if (cache_reply) args->sa_cache_this = 1; @@ -644,8 +644,8 @@ int nfs41_setup_sequence(struct nfs4_session *session, rpc_task_set_priority(task, RPC_PRIORITY_NORMAL); slot = tbl->slots + slotid; slot->renewal_time = jiffies; - args->sa_session = session; - args->sa_slotid = slotid; + + args->sa_slot = slot; dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 50bac7066160..27b0fec1a6b0 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1833,18 +1833,16 @@ static void encode_sequence(struct xdr_stream *xdr, struct compound_hdr *hdr) { #if defined(CONFIG_NFS_V4_1) - struct nfs4_session *session = args->sa_session; + struct nfs4_session *session; struct nfs4_slot_table *tp; - struct nfs4_slot *slot; + struct nfs4_slot *slot = args->sa_slot; __be32 *p; - if (!session) + if (slot == NULL) return; - tp = &session->fc_slot_table; - - WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE); - slot = tp->slots + args->sa_slotid; + tp = slot->table; + session = tp->session; encode_op_hdr(xdr, OP_SEQUENCE, decode_sequence_maxsz, hdr); @@ -1858,12 +1856,12 @@ static void encode_sequence(struct xdr_stream *xdr, ((u32 *)session->sess_id.data)[1], ((u32 *)session->sess_id.data)[2], ((u32 *)session->sess_id.data)[3], - slot->seq_nr, args->sa_slotid, + slot->seq_nr, slot->slot_nr, tp->highest_used_slotid, args->sa_cache_this); p = reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 16); p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); *p++ = cpu_to_be32(slot->seq_nr); - *p++ = cpu_to_be32(args->sa_slotid); + *p++ = cpu_to_be32(slot->slot_nr); *p++ = cpu_to_be32(tp->highest_used_slotid); *p = cpu_to_be32(args->sa_cache_this); #endif /* CONFIG_NFS_V4_1 */ @@ -2025,8 +2023,9 @@ static void encode_free_stateid(struct xdr_stream *xdr, static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args) { #if defined(CONFIG_NFS_V4_1) + if (args->sa_slot) - return args->sa_session->clp->cl_mvops->minor_version; + return args->sa_slot->table->session->clp->cl_mvops->minor_version; - if (args->sa_session) #endif /* CONFIG_NFS_V4_1 */ return 0; } -- cgit v1.2.1 From 2dc03b7f00d7fcd7dbb9302c5ebbd0c2b7fa3557 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 16 Nov 2012 16:10:11 -0500 Subject: NFSv4.1: Simplify slot allocation Clean up the NFSv4.1 slot allocation by replacing nfs4_find_slot() with a function nfs4_alloc_slot() that returns a pointer to the nfs4_slot instead of an offset into the slot table.
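A minimal sketch of the interface change this describes (prototypes inferred from the diff below, not a verbatim copy of the kernel source):

	/* Before: return a slot index, or NFS4_NO_SLOT when none is free;
	 * callers then computed tbl->slots + slotid themselves. */
	static u32 nfs4_find_slot(struct nfs4_slot_table *tbl);

	/* After: return the slot itself, or NULL when none is free. */
	static struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl);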
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6c41a34e34b4..0789ef18a94d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -556,20 +556,18 @@ static int nfs4_sequence_done(struct rpc_task *task, } /* - * nfs4_find_slot - efficiently look for a free slot + * nfs4_alloc_slot - efficiently look for a free slot * - * nfs4_find_slot looks for an unset bit in the used_slots bitmap. + * nfs4_alloc_slot looks for an unset bit in the used_slots bitmap. * If found, we mark the slot as used, update the highest_used_slotid, * and respectively set up the sequence operation args. - * The slot number is returned if found, or NFS4_NO_SLOT otherwise. * * Note: must be called with under the slot_tbl_lock. */ -static u32 -nfs4_find_slot(struct nfs4_slot_table *tbl) +static struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl) { + struct nfs4_slot *ret = NULL; u32 slotid; - u32 ret_id = NFS4_NO_SLOT; dprintk("--> %s used_slots=%04lx highest_used=%u max_slots=%u\n", __func__, tbl->used_slots[0], tbl->highest_used_slotid, @@ -581,11 +579,14 @@ nfs4_find_slot(struct nfs4_slot_table *tbl) if (slotid > tbl->highest_used_slotid || tbl->highest_used_slotid == NFS4_NO_SLOT) tbl->highest_used_slotid = slotid; - ret_id = slotid; + ret = &tbl->slots[slotid]; + ret->renewal_time = jiffies; + out: dprintk("<-- %s used_slots=%04lx highest_used=%d slotid=%d \n", - __func__, tbl->used_slots[0], tbl->highest_used_slotid, ret_id); - return ret_id; + __func__, tbl->used_slots[0], tbl->highest_used_slotid, + ret ? ret->slot_nr : -1); + return ret; } static void nfs41_init_sequence(struct nfs4_sequence_args *args, @@ -605,7 +606,6 @@ int nfs41_setup_sequence(struct nfs4_session *session, { struct nfs4_slot *slot; struct nfs4_slot_table *tbl; - u32 slotid; dprintk("--> %s\n", __func__); /* slot already allocated? */ @@ -632,8 +632,8 @@ int nfs41_setup_sequence(struct nfs4_session *session, return -EAGAIN; } - slotid = nfs4_find_slot(tbl); - if (slotid == NFS4_NO_SLOT) { + slot = nfs4_alloc_slot(tbl); + if (slot == NULL) { rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); spin_unlock(&tbl->slot_tbl_lock); dprintk("<-- %s: no free slots\n", __func__); @@ -642,12 +642,11 @@ int nfs41_setup_sequence(struct nfs4_session *session, spin_unlock(&tbl->slot_tbl_lock); rpc_task_set_priority(task, RPC_PRIORITY_NORMAL); - slot = tbl->slots + slotid; - slot->renewal_time = jiffies; args->sa_slot = slot; - dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr); + dprintk("<-- %s slotid=%d seqid=%d\n", __func__, + slot->slot_nr, slot->seq_nr); res->sr_slot = slot; res->sr_status_flags = 0; -- cgit v1.2.1 From f4af6e2abc8efb1695203a2b76876edf80f79960 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 20 Nov 2012 14:17:32 -0500 Subject: NFSv4.1: Clean up nfs4_free_slot Change the argument to take the pointer to the slot, instead of just the slotid. We know that the new value of highest_used_slotid must be less than the current value. No need to scan the whole table (for example, if slots 2, 5 and 9 are in use and slot 9 is freed, the new maximum must lie below bit 9, so scanning only the first 9 bits of the bitmap finds 5).
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0789ef18a94d..197ef3e4e1f7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -412,16 +412,18 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp * Must be called while holding tbl->slot_tbl_lock */ static void -nfs4_free_slot(struct nfs4_slot_table *tbl, u32 slotid) +nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot) { + u32 slotid = slot->slot_nr; + /* clear used bit in bitmap */ __clear_bit(slotid, tbl->used_slots); /* update highest_used_slotid when it is freed */ if (slotid == tbl->highest_used_slotid) { - slotid = find_last_bit(tbl->used_slots, tbl->max_slots); - if (slotid < tbl->max_slots) - tbl->highest_used_slotid = slotid; + u32 new_max = find_last_bit(tbl->used_slots, slotid); + if (new_max < slotid) + tbl->highest_used_slotid = new_max; else tbl->highest_used_slotid = NFS4_NO_SLOT; } @@ -480,7 +482,7 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) session = tbl->session; spin_lock(&tbl->slot_tbl_lock); - nfs4_free_slot(tbl, res->sr_slot - tbl->slots); + nfs4_free_slot(tbl, res->sr_slot); nfs4_check_drain_fc_complete(session); spin_unlock(&tbl->slot_tbl_lock); res->sr_slot = NULL; -- cgit v1.2.1 From a9efd39cd547223597cfe7c53acec44c099b9264 Mon Sep 17 00:00:00 2001 From: Seiji Aguchi Date: Wed, 14 Nov 2012 20:27:28 +0000 Subject: efi_pstore: Add ctime to argument of erase callback [Issue] Currently, a variable name, which is used to identify each log entry, consists of type, id and ctime. But an erase callback does not use ctime. If efi_pstore supported just one log, type and id would be enough. However, when multiple logs are supported, this doesn't work because entries can't be distinguished without ctime at erase time. As you can see below, efi_pstore can't differentiate the first event from the second one without ctime.
a variable name of first event: dump-type0-1-12345678
a variable name of second event: dump-type0-1-23456789
type:0 id:1 ctime:12345678, 23456789
[Solution] This patch adds ctime to the arguments of the erase callback. It works across reboots because ctime in pstore means the date that the record was originally stored. To do this, efi_pstore saves the ctime in the variable name at write time and passes it to pstore at read time.
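A sketch of the naming scheme described above (the format string and helper are illustrative assumptions, not the literal efi_pstore code):

	/* Build the variable name at write time, embedding the record's
	 * ctime so erase can later reconstruct the exact same name. */
	static void pstore_build_name(char *name, size_t len,
				      unsigned int type, u64 id, time_t ctime)
	{
		snprintf(name, len, "dump-type%u-%llu-%lu", type,
			 (unsigned long long)id, (unsigned long)ctime);
	}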
Signed-off-by: Seiji Aguchi Acked-by: Mike Waychison Signed-off-by: Tony Luck --- fs/pstore/inode.c | 3 ++- fs/pstore/ram.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 4ab572e6d277..4300af654710 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -175,7 +175,8 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) struct pstore_private *p = dentry->d_inode->i_private; if (p->psi->erase) - p->psi->erase(p->type, p->id, p->psi); + p->psi->erase(p->type, p->id, dentry->d_inode->i_ctime, + p->psi); return simple_unlink(dir, dentry); } diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 1a4f6da58eab..749693fcb75a 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -237,7 +237,7 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, } static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, - struct pstore_info *psi) + struct timespec time, struct pstore_info *psi) { struct ramoops_context *cxt = psi->data; struct persistent_ram_zone *prz; -- cgit v1.2.1 From 755d4fe46529018ae45bc7c86df682de45ace764 Mon Sep 17 00:00:00 2001 From: Seiji Aguchi Date: Mon, 26 Nov 2012 16:07:44 -0800 Subject: efi_pstore: Add a sequence counter to a variable name [Issue] Currently, a variable name, which identifies each entry, consists of type, id and ctime. But if multiple events happen in a short time, a second/third event may fail to log because efi_pstore can't distinguish each event with the current variable name. [Solution] A reasonable way to identify all events precisely is to introduce a sequence counter into the variable name. The sequence counter is already supported in the pstore layer as "oopscount". So, this patch adds it to the variable name. Also, it is passed to the read/erase callbacks of platform drivers in accordance with the modification of the variable name.
a variable name of first event: dump-type0-1-12345678
a variable name of second event: dump-type0-1-12345678
type:0 id:1 ctime:12345678
If multiple events happen in a short time, efi_pstore can't distinguish them because their variable names are identical. The events become distinguishable by adding a sequence counter, as follows.
a variable name of first event: dump-type0-1-1-12345678
a variable name of second event: dump-type0-1-2-12345678
type:0 id:1 sequence counter: 1(first event), 2(second event) ctime:12345678
In the case of the write callback executed from pstore_console_write(), "0" is passed as the count argument because it just logs all kernel messages and doesn't need to care about multiple events. Signed-off-by: Seiji Aguchi Acked-by: Rafael J.
Wysocki Acked-by: Mike Waychison Signed-off-by: Tony Luck --- fs/pstore/inode.c | 8 +++++--- fs/pstore/internal.h | 2 +- fs/pstore/platform.c | 13 +++++++------ fs/pstore/ram.c | 7 +++---- 4 files changed, 16 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 4300af654710..ed1d8c7212da 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -49,6 +49,7 @@ struct pstore_private { struct pstore_info *psi; enum pstore_type_id type; u64 id; + int count; ssize_t size; char data[]; }; @@ -175,8 +176,8 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) struct pstore_private *p = dentry->d_inode->i_private; if (p->psi->erase) - p->psi->erase(p->type, p->id, dentry->d_inode->i_ctime, - p->psi); + p->psi->erase(p->type, p->id, p->count, + dentry->d_inode->i_ctime, p->psi); return simple_unlink(dir, dentry); } @@ -271,7 +272,7 @@ int pstore_is_mounted(void) * Load it up with "size" bytes of data from "buf". * Set the mtime & ctime to the date that this record was originally stored. */ -int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, +int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, char *data, size_t size, struct timespec time, struct pstore_info *psi) { @@ -307,6 +308,7 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, goto fail_alloc; private->type = type; private->id = id; + private->count = count; private->psi = psi; switch (type) { diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 4847f588b7d5..937d820f273c 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -50,7 +50,7 @@ extern struct pstore_info *psinfo; extern void pstore_set_kmsg_bytes(int); extern void pstore_get_records(int); extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, - char *data, size_t size, + int count, char *data, size_t size, struct timespec time, struct pstore_info *psi); extern int pstore_is_mounted(void); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 947fbe06c3b1..5ea2e77ff023 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -136,7 +136,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, break; ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part, - hsize + len, psinfo); + oopscount, hsize + len, psinfo); if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) pstore_new_entry = 1; @@ -173,7 +173,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) spin_lock_irqsave(&psinfo->buf_lock, flags); } memcpy(psinfo->buf, s, c); - psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, c, psinfo); + psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, c, psinfo); spin_unlock_irqrestore(&psinfo->buf_lock, flags); s += c; c = e - s; @@ -197,7 +197,7 @@ static void pstore_register_console(void) {} static int pstore_write_compat(enum pstore_type_id type, enum kmsg_dump_reason reason, - u64 *id, unsigned int part, + u64 *id, unsigned int part, int count, size_t size, struct pstore_info *psi) { return psi->write_buf(type, reason, id, part, psinfo->buf, size, psi); @@ -267,6 +267,7 @@ void pstore_get_records(int quiet) char *buf = NULL; ssize_t size; u64 id; + int count; enum pstore_type_id type; struct timespec time; int failed = 0, rc; @@ -278,9 +279,9 @@ void pstore_get_records(int quiet) if (psi->open && psi->open(psi)) goto out; - while ((size = psi->read(&id, &type, &time, &buf, psi)) > 0) { - rc = pstore_mkfile(type, psi->name, id, buf, (size_t)size, - time, psi); + while ((size = 
psi->read(&id, &type, &count, &time, &buf, psi)) > 0) { + rc = pstore_mkfile(type, psi->name, id, count, buf, + (size_t)size, time, psi); kfree(buf); buf = NULL; if (rc && (rc != -EEXIST || !quiet)) diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 749693fcb75a..2bfa36e0ffe8 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -132,9 +132,8 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max, } static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, - struct timespec *time, - char **buf, - struct pstore_info *psi) + int *count, struct timespec *time, + char **buf, struct pstore_info *psi) { ssize_t size; struct ramoops_context *cxt = psi->data; @@ -236,7 +235,7 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, return 0; } -static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, +static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count, struct timespec time, struct pstore_info *psi) { struct ramoops_context *cxt = psi->data; -- cgit v1.2.1 From 1f20dfdaedcec4298a0a71fd396ec4828b332483 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sun, 18 Nov 2012 21:27:50 -0800 Subject: sysfs: Mark sysfs_attr_ns static Nothing outside of fs/sysfs/file.c references this function, so mark it static. Signed-off-by: Josh Triplett Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 00012e31829d..602f56db0442 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -485,8 +485,8 @@ const struct file_operations sysfs_file_operations = { .poll = sysfs_poll, }; -int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr, - const void **pns) +static int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr, + const void **pns) { struct sysfs_dirent *dir_sd = kobj->sd; const struct sysfs_ops *ops; -- cgit v1.2.1 From 05f564849d49499ced97913a0914b5950577d07d Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Mon, 26 Nov 2012 16:29:42 -0800 Subject: proc: check vma->vm_file before dereferencing Commit 7b540d0646ce ("proc_map_files_readdir(): don't bother with grabbing files") switched proc_map_files_readdir() to use @f_mode directly instead of grabbing a @file reference, but at the same time the test for @vm_file presence was lost, leading to a NULL dereference. The patch brings the test back. The whole proc_map_files feature is wrapped in CONFIG_CHECKPOINT_RESTORE (which is set to 'n' by default), so the bug doesn't affect regular kernels. The regression is 3.7-rc1 only as far as I can tell.
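For context, a brief sketch of why the check matters (standard mm semantics, not code from this patch): anonymous mappings such as the heap and stack have no backing file, so their VMAs carry a NULL vm_file.

	fmode_t mode = 0;

	if (vma->vm_file)	/* NULL for anonymous mappings */
		mode = vma->vm_file->f_mode;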
[gorcunov@openvz.org: provided changelog] Signed-off-by: Stanislav Kinsbursky Acked-by: Cyrill Gorcunov Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 3c231adf8450..9e28356a959a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1877,8 +1877,9 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, if (!vma) goto out_no_vma; - result = proc_map_files_instantiate(dir, dentry, task, - (void *)(unsigned long)vma->vm_file->f_mode); + if (vma->vm_file) + result = proc_map_files_instantiate(dir, dentry, task, + (void *)(unsigned long)vma->vm_file->f_mode); out_no_vma: up_read(&mm->mmap_sem); -- cgit v1.2.1 From 4eff96dd5283a102e0c1cac95247090be74a38ed Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 26 Nov 2012 16:29:51 -0800 Subject: writeback: put unused inodes to LRU after writeback completion Commit 169ebd90131b ("writeback: Avoid iput() from flusher thread") removed the iget-iput pair from inode writeback. As a side effect, inodes that are dirty during the iput_final() call won't ever be added to the inode LRU (iput_final() doesn't add dirty inodes to the LRU, and later when the inode is cleaned there's no one to add the inode there). Thus inodes are effectively unreclaimable until someone looks them up again. The practical effect of this bug is limited by the fact that inodes are pinned by a dentry for long enough that the inode gets cleaned. But still the bug can have nasty consequences leading up to OOM conditions under certain circumstances. The following can easily reproduce the problem:
for (( i = 0; i < 1000; i++ )); do
    mkdir $i
    for (( j = 0; j < 1000; j++ )); do
        touch $i/$j
        echo 2 > /proc/sys/vm/drop_caches
    done
done
then one needs to run 'sync; ls -lR' to make the inodes reclaimable again. We fix the issue by inserting unused clean inodes into the LRU after writeback finishes in inode_sync_complete(). Signed-off-by: Jan Kara Reported-by: OGAWA Hirofumi Cc: Al Viro Cc: OGAWA Hirofumi Cc: Wu Fengguang Cc: Dave Chinner Cc: [3.5+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 2 ++ fs/inode.c | 16 ++++++++++++++-- fs/internal.h | 1 + 3 files changed, 17 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 51ea267d444c..3e3422f7f0a4 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -228,6 +228,8 @@ static void requeue_io(struct inode *inode, struct bdi_writeback *wb) static void inode_sync_complete(struct inode *inode) { inode->i_state &= ~I_SYNC; + /* If inode is clean and unused, put it into LRU now... */ + inode_add_lru(inode); /* Waiters must see I_SYNC cleared before being woken up */ smp_mb(); wake_up_bit(&inode->i_state, __I_SYNC); diff --git a/fs/inode.c b/fs/inode.c index b03c71957246..64999f144153 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -408,6 +408,19 @@ static void inode_lru_list_add(struct inode *inode) spin_unlock(&inode->i_sb->s_inode_lru_lock); } +/* + * Add inode to LRU if needed (inode is unused and clean). + * + * Needs inode->i_lock held.
+ */ void inode_add_lru(struct inode *inode) { + if (!(inode->i_state & (I_DIRTY | I_SYNC | I_FREEING | I_WILL_FREE)) && + !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE) + inode_lru_list_add(inode); +} + + static void inode_lru_list_del(struct inode *inode) { spin_lock(&inode->i_sb->s_inode_lru_lock); @@ -1390,8 +1403,7 @@ static void iput_final(struct inode *inode) if (!drop && (sb->s_flags & MS_ACTIVE)) { inode->i_state |= I_REFERENCED; - if (!(inode->i_state & (I_DIRTY|I_SYNC))) - inode_lru_list_add(inode); + inode_add_lru(inode); spin_unlock(&inode->i_lock); return; } diff --git a/fs/internal.h b/fs/internal.h index 916b7cbf3e3e..2f6af7f645eb 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -110,6 +110,7 @@ extern int open_check_o_direct(struct file *f); * inode.c */ extern spinlock_t inode_sb_list_lock; +extern void inode_add_lru(struct inode *inode); /* * fs-writeback.c -- cgit v1.2.1 From 3a98b8614312026d489e56c1d0e294a68e2aad77 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 26 Nov 2012 09:48:41 -0500 Subject: cifs: fix writeback race with file that is growing Commit eddb079deb4 created a regression in the writepages codepath. Previously, whenever it needed to check the size of the file, it did so by consulting the inode->i_size field directly. With that patch, the i_size was fetched once on entry into the writepages code and that value was used henceforth. If the file is changing size though (for instance, if someone is writing to it or has truncated it), then that value is likely to be wrong. This can lead to data corruption. Pages past the EOF at the time that the writepages call was issued may be silently dropped and ignored because cifs_writepages wrongly assumes that the file must have been truncated in the interim. Fix cifs_writepages to fetch the size from the inode->i_size field each time instead, to properly account for this possibility. The original bug report is here: https://bugzilla.kernel.org/show_bug.cgi?id=50991 Reported-and-Tested-by: Maxim Britov Reviewed-by: Suresh Jayaraman Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/file.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index edb25b4bbb95..70b6f4c3a0c1 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1794,7 +1794,6 @@ static int cifs_writepages(struct address_space *mapping, struct TCP_Server_Info *server; struct page *page; int rc = 0; - loff_t isize = i_size_read(mapping->host); /* * If wsize is smaller than the page cache size, default to writing @@ -1899,7 +1898,7 @@ retry: */ set_page_writeback(page); - if (page_offset(page) >= isize) { + if (page_offset(page) >= i_size_read(mapping->host)) { done = true; unlock_page(page); end_page_writeback(page); @@ -1932,7 +1931,8 @@ retry: wdata->offset = page_offset(wdata->pages[0]); wdata->pagesz = PAGE_CACHE_SIZE; wdata->tailsz = min(i_size_read(mapping->host) - page_offset(wdata->pages[nr_pages - 1]), (loff_t)PAGE_CACHE_SIZE); wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) + wdata->tailsz; -- cgit v1.2.1 From a36b1725b342c8131a86a0238789d8e7bcb490dd Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sun, 25 Nov 2012 16:31:00 -0500 Subject: nfsd4: return badname, not inval, on "." or "..", or "/" The spec requires badname, not inval, in these cases. Some callers want us to return enoent, but I can see no justification for that. Signed-off-by: J.
Bruce Fields --- fs/nfsd/nfs4xdr.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 09204f590355..250171c5c311 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -65,17 +65,17 @@ #define NFS4_REFERRAL_FSID_MINOR 0x8000000ULL static __be32 -check_filename(char *str, int len, __be32 err) +check_filename(char *str, int len) { int i; if (len == 0) return nfserr_inval; if (isdotent(str, len)) - return err; + return nfserr_badname; for (i = 0; i < len; i++) if (str[i] == '/') - return err; + return nfserr_badname; return 0; } @@ -570,7 +570,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create READ32(create->cr_namelen); READ_BUF(create->cr_namelen); SAVEMEM(create->cr_name, create->cr_namelen); - if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval))) + if ((status = check_filename(create->cr_name, create->cr_namelen))) return status; status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, @@ -602,7 +602,7 @@ nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link) READ32(link->li_namelen); READ_BUF(link->li_namelen); SAVEMEM(link->li_name, link->li_namelen); - if ((status = check_filename(link->li_name, link->li_namelen, nfserr_inval))) + if ((status = check_filename(link->li_name, link->li_namelen))) return status; DECODE_TAIL; @@ -696,7 +696,7 @@ nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup READ32(lookup->lo_len); READ_BUF(lookup->lo_len); SAVEMEM(lookup->lo_name, lookup->lo_len); - if ((status = check_filename(lookup->lo_name, lookup->lo_len, nfserr_noent))) + if ((status = check_filename(lookup->lo_name, lookup->lo_len))) return status; DECODE_TAIL; @@ -860,7 +860,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) READ32(open->op_fname.len); READ_BUF(open->op_fname.len); SAVEMEM(open->op_fname.data, open->op_fname.len); - if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval))) + if ((status = check_filename(open->op_fname.data, open->op_fname.len))) return status; break; case NFS4_OPEN_CLAIM_PREVIOUS: @@ -875,7 +875,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) READ32(open->op_fname.len); READ_BUF(open->op_fname.len); SAVEMEM(open->op_fname.data, open->op_fname.len); - if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval))) + if ((status = check_filename(open->op_fname.data, open->op_fname.len))) return status; break; case NFS4_OPEN_CLAIM_FH: @@ -987,7 +987,7 @@ nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove READ32(remove->rm_namelen); READ_BUF(remove->rm_namelen); SAVEMEM(remove->rm_name, remove->rm_namelen); - if ((status = check_filename(remove->rm_name, remove->rm_namelen, nfserr_noent))) + if ((status = check_filename(remove->rm_name, remove->rm_namelen))) return status; DECODE_TAIL; @@ -1005,9 +1005,9 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename READ32(rename->rn_tnamelen); READ_BUF(rename->rn_tnamelen); SAVEMEM(rename->rn_tname, rename->rn_tnamelen); - if ((status = check_filename(rename->rn_sname, rename->rn_snamelen, nfserr_noent))) + if ((status = check_filename(rename->rn_sname, rename->rn_snamelen))) return status; - if ((status = check_filename(rename->rn_tname, rename->rn_tnamelen, nfserr_inval))) + if ((status = 
check_filename(rename->rn_tname, rename->rn_tnamelen))) return status; DECODE_TAIL; @@ -1034,8 +1034,7 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp, READ32(secinfo->si_namelen); READ_BUF(secinfo->si_namelen); SAVEMEM(secinfo->si_name, secinfo->si_namelen); - status = check_filename(secinfo->si_name, secinfo->si_namelen, - nfserr_noent); + status = check_filename(secinfo->si_name, secinfo->si_namelen); if (status) return status; DECODE_TAIL; -- cgit v1.2.1 From dba88ba55a06ff8bef467f2ca3f7904aeab8762a Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 16 Nov 2012 11:45:12 -0500 Subject: nfsd4: remove state lock from nfsd4_load_reboot_recovery_data That function is only called under nfsd_mutex: we know that because the only caller is nfsd_svc, via
    nfsd_svc
      nfsd_startup
        nfs4_state_start
          nfsd4_client_tracking_init
            client_tracking_ops->init == nfsd4_load_reboot_recovery_data
The shared state accessed here includes:
- user_recovery_dirname: used here, modified only by nfs4_reset_recoverydir, which can be verified to only be called under nfsd_mutex.
- filesystem state, protected by i_mutex (handwaving slightly here)
- rec_file, reclaim_str_hashtbl, reclaim_str_hashtbl_size: other than here, used only from code called from nfsd or laundromat threads, both of which should be started only after this runs (see nfsd_svc) and stopped before this could run again (see nfsd_shutdown, called from nfsd_last_thread).
Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4recover.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index b657b622bf5d..651d5134e74c 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -509,11 +509,9 @@ nfsd4_load_reboot_recovery_data(struct net *net) { int status; - nfs4_lock_state(); status = nfsd4_init_recdir(); if (!status) status = nfsd4_recdir_load(net); - nfs4_unlock_state(); if (status) printk(KERN_ERR "NFSD: Failure reading reboot recovery data\n"); return status; -- cgit v1.2.1 From ec28e02ca5f2a4287c19c585f8be2d9b3ba123ea Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Wed, 21 Nov 2012 18:07:38 +0300 Subject: nfsd4: remove state lock from nfs4_state_shutdown Protection of __nfs4_state_shutdown() with nfs4_lock_state() looks redundant. This function is called by the last NFSd thread on its exit, and the state lock actually protects two functions (del_recall_lru is protected by recall_lock):
1) nfsd4_client_tracking_exit
2) __nfs4_state_shutdown_net
"nfsd4_client_tracking_exit" doesn't require state lock protection, because its state can be modified only by tracker callbacks. Here they are:
1) create: is called only from nfsd4_proc_compound.
2) remove: is called from either nfsd4_proc_compound or nfs4_laundromat.
3) check: is called only from nfsd4_proc_compound.
4) grace_done: called only from nfs4_laundromat.
nfsd4_proc_compound is called only by an NFSd kthread, which is exiting right now. nfs4_laundromat is called by laundry_wq. But laundromat_work was canceled already. "__nfs4_state_shutdown_net" also doesn't require state lock protection, because all NFSd kthreads are dead, and no race can happen with NFSd start, because the "nfsd_up" flag is still set. Moreover, all NFSd shutdown is protected by the global nfsd_mutex. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J.
Bruce Fields --- fs/nfsd/nfs4state.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 41d2aed8ed06..ffec73cdfaca 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4930,9 +4930,7 @@ nfs4_state_shutdown(void) cancel_delayed_work_sync(&nn->laundromat_work); destroy_workqueue(laundry_wq); locks_end_grace(&nn->nfsd4_manager); - nfs4_lock_state(); __nfs4_state_shutdown(net); - nfs4_unlock_state(); nfsd4_destroy_callback_queue(); } -- cgit v1.2.1 From c9a4962881929df7f1ef6e63e1b9da304faca4dd Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Mon, 26 Nov 2012 15:21:58 +0300 Subject: nfsd: make client_lock per net This lock protects the client lru list and session hash table, which are allocated per network namespace already. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. Bruce Fields --- fs/nfsd/netns.h | 3 +++ fs/nfsd/nfs4state.c | 73 +++++++++++++++++++++++++++++++---------------------- 2 files changed, 46 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 227b93ebb622..08d5fa1ce82a 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -81,6 +81,9 @@ struct nfsd_net { struct list_head close_lru; struct delayed_work laundromat_work; + + /* client_lock protects the client lru list and session hash table */ + spinlock_t client_lock; }; extern int nfsd_net_id; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ffec73cdfaca..0e7e174de209 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -388,9 +388,6 @@ unhash_delegation(struct nfs4_delegation *dp) * SETCLIENTID state */ -/* client_lock protects the client lru list and session hash table */ -static DEFINE_SPINLOCK(client_lock); - static unsigned int clientid_hashval(u32 id) { return id & CLIENT_HASH_MASK; @@ -872,18 +869,23 @@ static void __free_session(struct nfsd4_session *ses) static void free_session(struct kref *kref) { struct nfsd4_session *ses; + struct nfsd_net *nn; - lockdep_assert_held(&client_lock); ses = container_of(kref, struct nfsd4_session, se_ref); + nn = net_generic(ses->se_client->net, nfsd_net_id); + + lockdep_assert_held(&nn->client_lock); nfsd4_del_conns(ses); __free_session(ses); } void nfsd4_put_session(struct nfsd4_session *ses) { - spin_lock(&client_lock); + struct nfsd_net *nn = net_generic(ses->se_client->net, nfsd_net_id); + + spin_lock(&nn->client_lock); nfsd4_put_session_locked(ses); - spin_unlock(&client_lock); + spin_unlock(&nn->client_lock); } static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan) @@ -927,12 +929,12 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru new->se_cb_sec = cses->cb_sec; kref_init(&new->se_ref); idx = hash_sessionid(&new->se_sessionid); - spin_lock(&client_lock); + spin_lock(&nn->client_lock); list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]); spin_lock(&clp->cl_lock); list_add(&new->se_perclnt, &clp->cl_sessions); spin_unlock(&clp->cl_lock); - spin_unlock(&client_lock); + spin_unlock(&nn->client_lock); if (cses->flags & SESSION4_BACK_CHAN) { struct sockaddr *sa = svc_addr(rqstp); @@ -1005,9 +1007,11 @@ renew_client_locked(struct nfs4_client *clp) static inline void renew_client(struct nfs4_client *clp) { - spin_lock(&client_lock); + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + spin_lock(&nn->client_lock); renew_client_locked(clp); - spin_unlock(&client_lock); + spin_unlock(&nn->client_lock); } /* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */ 
@@ -1045,7 +1049,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) static inline void free_client(struct nfs4_client *clp) { - lockdep_assert_held(&client_lock); + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + lockdep_assert_held(&nn->client_lock); while (!list_empty(&clp->cl_sessions)) { struct nfsd4_session *ses; ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, @@ -1062,15 +1068,16 @@ void release_session_client(struct nfsd4_session *session) { struct nfs4_client *clp = session->se_client; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); - if (!atomic_dec_and_lock(&clp->cl_refcount, &client_lock)) + if (!atomic_dec_and_lock(&clp->cl_refcount, &nn->client_lock)) return; if (is_client_expired(clp)) { free_client(clp); session->se_client = NULL; } else renew_client_locked(clp); - spin_unlock(&client_lock); + spin_unlock(&nn->client_lock); } /* must be called under the client_lock */ @@ -1119,11 +1126,11 @@ destroy_client(struct nfs4_client *clp) rb_erase(&clp->cl_namenode, &nn->conf_name_tree); else rb_erase(&clp->cl_namenode, &nn->unconf_name_tree); - spin_lock(&client_lock); + spin_lock(&nn->client_lock); unhash_client_locked(clp); if (atomic_read(&clp->cl_refcount) == 0) free_client(clp); - spin_unlock(&client_lock); + spin_unlock(&nn->client_lock); } static void expire_client(struct nfs4_client *clp) @@ -1274,6 +1281,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, struct sockaddr *sa = svc_addr(rqstp); int ret; struct net *net = SVC_NET(rqstp); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); clp = alloc_client(name); if (clp == NULL) @@ -1282,9 +1290,9 @@ static struct nfs4_client *create_client(struct xdr_netobj name, INIT_LIST_HEAD(&clp->cl_sessions); ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred); if (ret) { - spin_lock(&client_lock); + spin_lock(&nn->client_lock); free_client(clp); - spin_unlock(&client_lock); + spin_unlock(&nn->client_lock); return NULL; } idr_init(&clp->cl_stateids); @@ -1873,11 +1881,12 @@ static __be32 nfsd4_map_bcts_dir(u32 *dir) __be32 nfsd4_backchannel_ctl(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_backchannel_ctl *bc) { struct nfsd4_session *session = cstate->session; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); - spin_lock(&client_lock); + spin_lock(&nn->client_lock); session->se_cb_prog = bc->bc_cb_program; session->se_cb_sec = bc->bc_cb_sec; - spin_unlock(&client_lock); + spin_unlock(&nn->client_lock); nfsd4_probe_callback(session->se_client); @@ -1890,10 +1899,11 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp, { __be32 status; struct nfsd4_conn *conn; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); if (!nfsd4_last_compound_op(rqstp)) return nfserr_not_only_op; - spin_lock(&client_lock); + spin_lock(&nn->client_lock); cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid, SVC_NET(rqstp)); /* Sorta weird: we only need the refcnt'ing because new_conn acquires * client_lock iself: */ @@ -1901,7 +1911,7 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp, nfsd4_get_session(cstate->session); atomic_inc(&cstate->session->se_client->cl_refcount); } - spin_unlock(&client_lock); + spin_unlock(&nn->client_lock); if (!cstate->session) return nfserr_badsession; @@ -1929,6 +1939,7 @@ nfsd4_destroy_session(struct svc_rqst *r, { struct nfsd4_session *ses; __be32 status = nfserr_badsession; + struct nfsd_net *nn = net_generic(SVC_NET(r), nfsd_net_id); /* Notes: * - The confirmed 
nfs4_client->cl_sessionid holds destroyed sessinid @@ -1942,24 +1953,24 @@ nfsd4_destroy_session(struct svc_rqst *r, return nfserr_not_only_op; } dump_sessionid(__func__, &sessionid->sessionid); - spin_lock(&client_lock); + spin_lock(&nn->client_lock); ses = find_in_sessionid_hashtbl(&sessionid->sessionid, SVC_NET(r)); if (!ses) { - spin_unlock(&client_lock); + spin_unlock(&nn->client_lock); goto out; } unhash_session(ses); - spin_unlock(&client_lock); + spin_unlock(&nn->client_lock); nfs4_lock_state(); nfsd4_probe_callback_sync(ses->se_client); nfs4_unlock_state(); - spin_lock(&client_lock); + spin_lock(&nn->client_lock); nfsd4_del_conns(ses); nfsd4_put_session_locked(ses); - spin_unlock(&client_lock); + spin_unlock(&nn->client_lock); status = nfs_ok; out: dprintk("%s returns %d\n", __func__, ntohl(status)); @@ -2025,6 +2036,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_slot *slot; struct nfsd4_conn *conn; __be32 status; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); if (resp->opcnt != 1) return nfserr_sequence_pos; @@ -2037,7 +2049,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, if (!conn) return nfserr_jukebox; - spin_lock(&client_lock); + spin_lock(&nn->client_lock); status = nfserr_badsession; session = find_in_sessionid_hashtbl(&seq->sessionid, SVC_NET(rqstp)); if (!session) @@ -2113,7 +2125,7 @@ out: } } kfree(conn); - spin_unlock(&client_lock); + spin_unlock(&nn->client_lock); dprintk("%s: return %d\n", __func__, ntohl(status)); return status; } @@ -3191,7 +3203,7 @@ nfs4_laundromat(struct nfsd_net *nn) dprintk("NFSD: laundromat service - starting\n"); nfsd4_end_grace(nn); INIT_LIST_HEAD(&reaplist); - spin_lock(&client_lock); + spin_lock(&nn->client_lock); list_for_each_safe(pos, next, &nn->client_lru) { clp = list_entry(pos, struct nfs4_client, cl_lru); if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) { @@ -3208,7 +3220,7 @@ nfs4_laundromat(struct nfsd_net *nn) unhash_client_locked(clp); list_add(&clp->cl_lru, &reaplist); } - spin_unlock(&client_lock); + spin_unlock(&nn->client_lock); list_for_each_safe(pos, next, &reaplist) { clp = list_entry(pos, struct nfs4_client, cl_lru); dprintk("NFSD: purging unused client (clientid %08x)\n", @@ -4796,6 +4808,7 @@ static int nfs4_state_start_net(struct net *net) nn->unconf_name_tree = RB_ROOT; INIT_LIST_HEAD(&nn->client_lru); INIT_LIST_HEAD(&nn->close_lru); + spin_lock_init(&nn->client_lock); INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main); -- cgit v1.2.1 From 4e37a7c2075baa2a15a2ab90fcc44173888016ed Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Mon, 26 Nov 2012 15:22:03 +0300 Subject: nfsd: make delegations shutdown network namespace aware NFSv4 delegations are stored in a global list. But they are nfs4_client-dependent, and nfs4_client is already network namespace aware. State shutdown and laundromat are done per network namespace as well. So, delegation unhashing has to be done in network namespace context. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J.
Bruce Fields --- fs/nfsd/nfs4state.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 0e7e174de209..bc2fc9f076fc 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3230,6 +3230,8 @@ nfs4_laundromat(struct nfsd_net *nn) spin_lock(&recall_lock); list_for_each_safe(pos, next, &del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); + if (net_generic(dp->dl_stid.sc_client->net, nfsd_net_id) != nn) + continue; if (time_after((unsigned long)dp->dl_time, (unsigned long)cutoff)) { u = dp->dl_time - cutoff; if (test_val > u) @@ -4922,6 +4924,8 @@ __nfs4_state_shutdown(struct net *net) spin_lock(&recall_lock); list_for_each_safe(pos, next, &del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); + if (dp->dl_stid.sc_client->net != net) + continue; list_move(&dp->dl_recall_lru, &reaplist); } spin_unlock(&recall_lock); -- cgit v1.2.1 From 4dce0ac9069bbebfd34f890f599ccdb92fa76e9f Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Mon, 26 Nov 2012 15:22:08 +0300 Subject: nfsd: cleanup NFSd state shutdown a bit This patch renames __nfs4_state_shutdown_net() into nfs4_state_destroy_net() and __nfs4_state_shutdown() into nfs4_state_shutdown_net(), and moves all network-related shutdown operations to nfs4_state_shutdown_net(). Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index bc2fc9f076fc..84a27a24b806 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4829,7 +4829,7 @@ err: } static void -__nfs4_state_shutdown_net(struct net *net) +nfs4_state_destroy_net(struct net *net) { int i; struct nfs4_client *clp = NULL; @@ -4857,6 +4857,7 @@ __nfs4_state_shutdown_net(struct net *net) kfree(nn->ownerstr_hashtbl); kfree(nn->unconf_id_hashtbl); kfree(nn->conf_id_hashtbl); + put_net(net); } /* initialization to perform when the nfsd service is started: */ @@ -4906,19 +4907,20 @@ out_free_laundry: destroy_workqueue(laundry_wq); out_recovery: nfsd4_client_tracking_exit(net); - __nfs4_state_shutdown_net(net); - put_net(net); + nfs4_state_destroy_net(net); return ret; } /* should be called with the state lock held */ static void -__nfs4_state_shutdown(struct net *net) +nfs4_state_shutdown_net(struct net *net) { struct nfs4_delegation *dp = NULL; struct list_head *pos, *next, reaplist; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); - __nfs4_state_shutdown_net(net); + cancel_delayed_work_sync(&nn->laundromat_work); + locks_end_grace(&nn->nfsd4_manager); INIT_LIST_HEAD(&reaplist); spin_lock(&recall_lock); @@ -4935,19 +4937,16 @@ __nfs4_state_shutdown(struct net *net) } nfsd4_client_tracking_exit(net); - put_net(net); + nfs4_state_destroy_net(net); } void nfs4_state_shutdown(void) { struct net *net = &init_net; - struct nfsd_net *nn = net_generic(net, nfsd_net_id); - cancel_delayed_work_sync(&nn->laundromat_work); + nfs4_state_shutdown_net(net); destroy_workqueue(laundry_wq); - locks_end_grace(&nn->nfsd4_manager); - __nfs4_state_shutdown(net); nfsd4_destroy_callback_queue(); } -- cgit v1.2.1 From d85ed443052570b25ea4b5f5fa70c57e0129fbc4 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Mon, 26 Nov 2012 15:22:13 +0300 Subject: nfsd: cleanup NFSd state start a bit This patch renames nfs4_state_start_net() into nfs4_state_create_net(), where get_net() is now performed.
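With this split, the struct net reference is owned by the create/destroy pair itself rather than by the caller. Roughly, the resulting pairing looks like this (a simplified sketch, not the literal patch; hash-table allocation, error handling and the nfsd_net lookup are elided):

static int nfs4_state_create_net(struct net *net)
{
	/* ... allocate per-net hash tables, init nn->client_lock ... */
	get_net(net);	/* reference dropped again in nfs4_state_destroy_net() */
	return 0;
}

static void nfs4_state_destroy_net(struct net *net)
{
	/* ... free per-net hash tables ... */
	put_net(net);	/* balances the get_net() in nfs4_state_create_net() */
}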
Also it introduces new nfs4_state_start_net(), which is now responsible for state creation and initializing all per-net data and which is now called from nfs4_state_start(). Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 59 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 84a27a24b806..6f5798623eb1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4770,7 +4770,7 @@ set_max_delegations(void) max_delegations = nr_free_buffer_pages() >> (20 - 2 - PAGE_SHIFT); } -static int nfs4_state_start_net(struct net *net) +static int nfs4_state_create_net(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); int i; @@ -4813,6 +4813,7 @@ static int nfs4_state_start_net(struct net *net) spin_lock_init(&nn->client_lock); INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main); + get_net(net); return 0; @@ -4860,37 +4861,35 @@ nfs4_state_destroy_net(struct net *net) put_net(net); } -/* initialization to perform when the nfsd service is started: */ - -int -nfs4_state_start(void) +static int +nfs4_state_start_net(struct net *net) { - struct net *net = &init_net; struct nfsd_net *nn = net_generic(net, nfsd_net_id); int ret; - /* - * FIXME: For now, we hang most of the pernet global stuff off of - * init_net until nfsd is fully containerized. Eventually, we'll - * need to pass a net pointer into this function, take a reference - * to that instead and then do most of the rest of this on a per-net - * basis. - */ - get_net(net); - ret = nfs4_state_start_net(net); + ret = nfs4_state_create_net(net); if (ret) return ret; nfsd4_client_tracking_init(net); nn->boot_time = get_seconds(); locks_start_grace(net, &nn->nfsd4_manager); nn->grace_ended = false; - printk(KERN_INFO "NFSD: starting %ld-second grace period\n", - nfsd4_grace); + printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n", + nfsd4_grace, net); + queue_delayed_work(laundry_wq, &nn->laundromat_work, nfsd4_grace * HZ); + return 0; +} + +/* initialization to perform when the nfsd service is started: */ + +int +nfs4_state_start(void) +{ + int ret; + ret = set_callback_cred(); - if (ret) { - ret = -ENOMEM; - goto out_recovery; - } + if (ret) + return -ENOMEM; laundry_wq = create_singlethread_workqueue("nfsd4"); if (laundry_wq == NULL) { ret = -ENOMEM; @@ -4900,14 +4899,26 @@ nfs4_state_start(void) if (ret) goto out_free_laundry; - queue_delayed_work(laundry_wq, &nn->laundromat_work, nfsd4_grace * HZ); set_max_delegations(); + + /* + * FIXME: For now, we hang most of the pernet global stuff off of + * init_net until nfsd is fully containerized. Eventually, we'll + * need to pass a net pointer into this function, take a reference + * to that instead and then do most of the rest of this on a per-net + * basis. + */ + ret = nfs4_state_start_net(&init_net); + if (ret) + goto out_free_callback; + return 0; + +out_free_callback: + nfsd4_destroy_callback_queue(); out_free_laundry: destroy_workqueue(laundry_wq); out_recovery: - nfsd4_client_tracking_exit(net); - nfs4_state_destroy_net(net); return ret; } -- cgit v1.2.1 From f252bc6806a9428f2e3a429e4cdffbd012de9839 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Mon, 26 Nov 2012 15:22:18 +0300 Subject: nfsd: call state init and shutdown twice Split NFSv4 state init and shutdown into two different calls: per-net one and generic one. 
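The intended calling order is condensed in the following sketch, taken from the nfssvc.c hunk further below (socket, lockd and racache setup are elided):

static int nfsd_startup(int nrservs)
{
	struct net *net = &init_net;
	int ret;

	ret = nfs4_state_start();		/* generic half: once per nfsd */
	if (ret)
		return ret;
	ret = nfs4_state_start_net(net);	/* per-net half: once per namespace */
	if (ret)
		nfs4_state_shutdown();		/* unwind the generic half */
	return ret;
}

static void nfsd_shutdown(void)
{
	nfs4_state_shutdown_net(&init_net);	/* per-net teardown first */
	nfs4_state_shutdown();			/* then the generic half */
}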
The per-net init/shutdown pair has to be called for any namespace; the generic pair only once, on NFSd kthread start and shutdown respectively. Refresh of diff-nfsd-call-state-init-twice Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 30 ++++++++++++------------------ fs/nfsd/nfsd.h | 4 ++++ fs/nfsd/nfssvc.c | 15 +++++++++++++-- 3 files changed, 29 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 6f5798623eb1..fb98f291aac2 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4861,12 +4861,22 @@ nfs4_state_destroy_net(struct net *net) put_net(net); } -static int +int nfs4_state_start_net(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); int ret; + /* + * FIXME: For now, we hang most of the pernet global stuff off of + * init_net until nfsd is fully containerized. Eventually, we'll + * need to pass a net pointer into this function, take a reference + * to that instead and then do most of the rest of this on a per-net + * basis. + */ + if (net != &init_net) + return -EINVAL; + ret = nfs4_state_create_net(net); if (ret) return ret; @@ -4901,21 +4911,8 @@ nfs4_state_start(void) set_max_delegations(); - /* - * FIXME: For now, we hang most of the pernet global stuff off of - * init_net until nfsd is fully containerized. Eventually, we'll - * need to pass a net pointer into this function, take a reference - * to that instead and then do most of the rest of this on a per-net - * basis. - */ - ret = nfs4_state_start_net(&init_net); - if (ret) - goto out_free_callback; - return 0; -out_free_callback: - nfsd4_destroy_callback_queue(); out_free_laundry: destroy_workqueue(laundry_wq); out_recovery: @@ -4923,7 +4920,7 @@ out_recovery: } /* should be called with the state lock held */ -static void +void nfs4_state_shutdown_net(struct net *net) { struct nfs4_delegation *dp = NULL; @@ -4954,9 +4951,6 @@ nfs4_state_shutdown_net(struct net *net) void nfs4_state_shutdown(void) { - struct net *net = &init_net; - - nfs4_state_shutdown_net(net); destroy_workqueue(laundry_wq); nfsd4_destroy_callback_queue(); } diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 80d5ce40aadb..d7b210b735e1 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -121,7 +121,9 @@ void nfs4_state_init(void); int nfsd4_init_slabs(void); void nfsd4_free_slabs(void); int nfs4_state_start(void); +int nfs4_state_start_net(struct net *net); void nfs4_state_shutdown(void); +void nfs4_state_shutdown_net(struct net *net); void nfs4_reset_lease(time_t leasetime); int nfs4_reset_recoverydir(char *recdir); char * nfs4_recoverydir(void); @@ -130,7 +132,9 @@ static inline void nfs4_state_init(void) { } static inline int nfsd4_init_slabs(void) { return 0; } static inline void nfsd4_free_slabs(void) { } static inline int nfs4_state_start(void) { return 0; } +static inline int nfs4_state_start_net(struct net *net) { return 0; } static inline void nfs4_state_shutdown(void) { } +static inline void nfs4_state_shutdown_net(struct net *net) { } static inline void nfs4_reset_lease(time_t leasetime) { } static inline int nfs4_reset_recoverydir(char *recdir) { return 0; } static inline char * nfs4_recoverydir(void) {return NULL; } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 30d3784d0280..b34a67d8ec44 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -207,6 +207,7 @@ static bool nfsd_up = false; static int nfsd_startup(int nrservs) { int ret; + struct net *net = &init_net; if (nfsd_up) return 0; @@ -221,14 +222,21 @@ static int nfsd_startup(int
nrservs) ret = nfsd_init_socks(); if (ret) goto out_racache; - ret = lockd_up(&init_net); + ret = lockd_up(net); if (ret) goto out_racache; ret = nfs4_state_start(); if (ret) goto out_lockd; + + ret = nfs4_state_start_net(net); + if (ret) + goto out_net_state; + nfsd_up = true; return 0; +out_net_state: + nfs4_state_shutdown(); out_lockd: lockd_down(&init_net); out_racache: @@ -238,6 +246,8 @@ out_racache: static void nfsd_shutdown(void) { + struct net *net = &init_net; + /* * write_ports can create the server without actually starting * any threads--if we get shut down before any threads are @@ -246,8 +256,9 @@ static void nfsd_shutdown(void) */ if (!nfsd_up) return; + nfs4_state_shutdown_net(net); nfs4_state_shutdown(); - lockd_down(&init_net); + lockd_down(net); nfsd_racache_shutdown(); nfsd_up = false; } -- cgit v1.2.1 From 3a0733692f6665a28c50ebadb6d9db2b183bcb91 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Mon, 26 Nov 2012 16:16:25 +0300 Subject: nfsd: recovery - make rec_file per net Opening and closing of this file is done in client tracking init and exit operations. Client tracking is done in network namespace context already. So let's make this file opened and closed per network context - this will simplify its management. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. Bruce Fields --- fs/nfsd/netns.h | 2 ++ fs/nfsd/nfs4recover.c | 70 +++++++++++++++++++++++++-------------------------- 2 files changed, 37 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 08d5fa1ce82a..130563210c68 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -84,6 +84,8 @@ struct nfsd_net { /* client_lock protects the client lru list and session hash table */ spinlock_t client_lock; + + struct file *rec_file; }; extern int nfsd_net_id; diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 651d5134e74c..3e76d281bba8 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -62,7 +62,6 @@ struct nfsd4_client_tracking_ops { }; /* Globals */ -static struct file *rec_file; static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; static struct nfsd4_client_tracking_ops *client_tracking_ops; static bool in_grace; @@ -182,7 +181,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) if (test_and_set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; - if (!rec_file) + if (!nn->rec_file) return; status = nfs4_make_rec_clidname(dname, &clp->cl_name); @@ -193,11 +192,11 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) if (status < 0) return; - status = mnt_want_write_file(rec_file); + status = mnt_want_write_file(nn->rec_file); if (status) return; - dir = rec_file->f_path.dentry; + dir = nn->rec_file->f_path.dentry; /* lock the parent */ mutex_lock(&dir->d_inode->i_mutex); @@ -227,14 +226,14 @@ out_unlock: if (crp) crp->cr_clp = clp; } - vfs_fsync(rec_file, 0); + vfs_fsync(nn->rec_file, 0); } else { printk(KERN_ERR "NFSD: failed to write recovery record" " (err %d); please check that %s exists" " and is writeable", status, user_recovery_dirname); } - mnt_drop_write_file(rec_file); + mnt_drop_write_file(nn->rec_file); nfs4_reset_creds(original_cred); } @@ -267,7 +266,7 @@ static int nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) { const struct cred *original_cred; - struct dentry *dir = rec_file->f_path.dentry; + struct dentry *dir = nn->rec_file->f_path.dentry; LIST_HEAD(names); int status; @@ -275,13 +274,13 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) if (status < 0) return status; - 
status = vfs_llseek(rec_file, 0, SEEK_SET); + status = vfs_llseek(nn->rec_file, 0, SEEK_SET); if (status < 0) { nfs4_reset_creds(original_cred); return status; } - status = vfs_readdir(rec_file, nfsd4_build_namelist, &names); + status = vfs_readdir(nn->rec_file, nfsd4_build_namelist, &names); mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); while (!list_empty(&names)) { struct name_list *entry; @@ -305,14 +304,14 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) } static int -nfsd4_unlink_clid_dir(char *name, int namlen) +nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn) { struct dentry *dir, *dentry; int status; dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name); - dir = rec_file->f_path.dentry; + dir = nn->rec_file->f_path.dentry; mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); dentry = lookup_one_len(name, dir, namlen); if (IS_ERR(dentry)) { @@ -339,14 +338,14 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) int status; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); - if (!rec_file || !test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + if (!nn->rec_file || !test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; status = nfs4_make_rec_clidname(dname, &clp->cl_name); if (status) return legacy_recdir_name_error(status); - status = mnt_want_write_file(rec_file); + status = mnt_want_write_file(nn->rec_file); if (status) goto out; clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); @@ -355,10 +354,10 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) if (status < 0) goto out_drop_write; - status = nfsd4_unlink_clid_dir(dname, HEXDIR_LEN-1); + status = nfsd4_unlink_clid_dir(dname, HEXDIR_LEN-1, nn); nfs4_reset_creds(original_cred); if (status == 0) { - vfs_fsync(rec_file, 0); + vfs_fsync(nn->rec_file, 0); if (in_grace) { /* remove reclaim record */ crp = nfsd4_find_reclaim_client(dname, nn); @@ -367,7 +366,7 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) } } out_drop_write: - mnt_drop_write_file(rec_file); + mnt_drop_write_file(nn->rec_file); out: if (status) printk("NFSD: Failed to remove expired client state directory" @@ -396,20 +395,20 @@ nfsd4_recdir_purge_old(struct nfsd_net *nn, time_t boot_time) int status; in_grace = false; - if (!rec_file) + if (!nn->rec_file) return; - status = mnt_want_write_file(rec_file); + status = mnt_want_write_file(nn->rec_file); if (status) goto out; status = nfsd4_list_rec_dir(purge_old, nn); if (status == 0) - vfs_fsync(rec_file, 0); - mnt_drop_write_file(rec_file); + vfs_fsync(nn->rec_file, 0); + mnt_drop_write_file(nn->rec_file); out: nfs4_release_reclaim(nn); if (status) printk("nfsd4: failed to purge old clients from recovery" - " directory %s\n", rec_file->f_path.dentry->d_name.name); + " directory %s\n", nn->rec_file->f_path.dentry->d_name.name); } static int @@ -430,13 +429,13 @@ nfsd4_recdir_load(struct net *net) { int status; struct nfsd_net *nn = net_generic(net, nfsd_net_id); - if (!rec_file) + if (!nn->rec_file) return 0; status = nfsd4_list_rec_dir(load_recdir, nn); if (status) printk("nfsd4: failed loading clients from recovery" - " directory %s\n", rec_file->f_path.dentry->d_name.name); + " directory %s\n", nn->rec_file->f_path.dentry->d_name.name); return status; } @@ -445,15 +444,16 @@ nfsd4_recdir_load(struct net *net) { */ static int -nfsd4_init_recdir(void) +nfsd4_init_recdir(struct net *net) { + struct nfsd_net *nn = net_generic(net, nfsd_net_id); const struct cred *original_cred; int status; printk("NFSD: Using %s as the NFSv4 state recovery directory\n", 
user_recovery_dirname); - BUG_ON(rec_file); + BUG_ON(nn->rec_file); status = nfs4_save_creds(&original_cred); if (status < 0) { @@ -463,12 +463,12 @@ nfsd4_init_recdir(void) return status; } - rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0); - if (IS_ERR(rec_file)) { + nn->rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0); + if (IS_ERR(nn->rec_file)) { printk("NFSD: unable to find recovery directory %s\n", user_recovery_dirname); - status = PTR_ERR(rec_file); - rec_file = NULL; + status = PTR_ERR(nn->rec_file); + nn->rec_file = NULL; } nfs4_reset_creds(original_cred); @@ -509,7 +509,7 @@ nfsd4_load_reboot_recovery_data(struct net *net) { int status; - status = nfsd4_init_recdir(); + status = nfsd4_init_recdir(net); if (!status) status = nfsd4_recdir_load(net); if (status) @@ -544,12 +544,12 @@ err: } static void -nfsd4_shutdown_recdir(void) +nfsd4_shutdown_recdir(struct nfsd_net *nn) { - if (!rec_file) + if (!nn->rec_file) return; - fput(rec_file); - rec_file = NULL; + fput(nn->rec_file); + nn->rec_file = NULL; } static void @@ -558,7 +558,7 @@ nfsd4_legacy_tracking_exit(struct net *net) struct nfsd_net *nn = net_generic(net, nfsd_net_id); nfs4_release_reclaim(nn); - nfsd4_shutdown_recdir(); + nfsd4_shutdown_recdir(nn); nfs4_legacy_state_shutdown(net); } -- cgit v1.2.1 From f141f79d709de447c8c92ba54821740ae53a5d07 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Mon, 26 Nov 2012 16:16:30 +0300 Subject: nfsd: recovery - make in_grace per net Flag in_grace is a part of client tracking state, which is network namespace aware. So let's replace the global static variable with a per-net one. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. Bruce Fields --- fs/nfsd/netns.h | 1 + fs/nfsd/nfs4recover.c | 9 ++++----- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 130563210c68..9047706b3e10 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -86,6 +86,7 @@ struct nfsd_net { spinlock_t client_lock; struct file *rec_file; + bool in_grace; }; extern int nfsd_net_id; diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 3e76d281bba8..359793f89493 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -64,7 +64,6 @@ struct nfsd4_client_tracking_ops { /* Globals */ static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; static struct nfsd4_client_tracking_ops *client_tracking_ops; -static bool in_grace; static int nfs4_save_creds(const struct cred **original_creds) @@ -221,7 +220,7 @@ out_put: out_unlock: mutex_unlock(&dir->d_inode->i_mutex); if (status == 0) { - if (in_grace) { + if (nn->in_grace) { crp = nfs4_client_to_reclaim(dname, nn); if (crp) crp->cr_clp = clp; @@ -358,7 +357,7 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) nfs4_reset_creds(original_cred); if (status == 0) { vfs_fsync(nn->rec_file, 0); - if (in_grace) { + if (nn->in_grace) { /* remove reclaim record */ crp = nfsd4_find_reclaim_client(dname, nn); if (crp) @@ -394,7 +393,7 @@ nfsd4_recdir_purge_old(struct nfsd_net *nn, time_t boot_time) { int status; - in_grace = false; + nn->in_grace = false; if (!nn->rec_file) return; status = mnt_want_write_file(nn->rec_file); @@ -473,7 +472,7 @@ nfsd4_init_recdir(struct net *net) nfs4_reset_creds(original_cred); if (!status) - in_grace = true; + nn->in_grace = true; return status; } -- cgit v1.2.1 From 864aee5c6f90533984c356494e6b0a8070e5d5f2 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Tue, 27 Nov 2012 14:42:20 +0300
Subject: nfsd: remove redundant declarations This is a cleanup patch. Functions nfsd_pool_stats_open() and nfsd_pool_stats_release() are declared in fs/nfsd/nfsd.h. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsctl.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index dab350dfc376..f5ab74af6ce2 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -186,9 +186,6 @@ static struct file_operations supported_enctypes_ops = { }; #endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ -extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); -extern int nfsd_pool_stats_release(struct inode *inode, struct file *file); - static const struct file_operations pool_stats_operations = { .open = nfsd_pool_stats_open, .read = seq_read, -- cgit v1.2.1 From 3d7337115d06f21970e23684f4d2e62e3a44c572 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Tue, 27 Nov 2012 14:11:44 +0300 Subject: nfsd: make NFSv4 lease time per net Lease time is a part of NFSv4 state engine, which is constructed per network namespace. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. Bruce Fields --- fs/nfsd/netns.h | 2 ++ fs/nfsd/nfs4callback.c | 8 +++++--- fs/nfsd/nfs4state.c | 11 +++++------ fs/nfsd/nfs4xdr.c | 4 +++- fs/nfsd/nfsctl.c | 5 ++++- fs/nfsd/nfsd.h | 1 - 6 files changed, 19 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 9047706b3e10..0c20be82cb01 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -87,6 +87,8 @@ struct nfsd_net { struct file *rec_file; bool in_grace; + + time_t nfsd4_lease; }; extern int nfsd_net_id; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 826cc269c445..99bc85ff0217 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -36,6 +36,7 @@ #include #include "nfsd.h" #include "state.h" +#include "netns.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -625,9 +626,10 @@ static const struct rpc_program cb_program = { .pipe_dir_name = "nfsd4_cb", }; -static int max_cb_time(void) +static int max_cb_time(struct net *net) { - return max(nfsd4_lease/10, (time_t)1) * HZ; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + return max(nn->nfsd4_lease/10, (time_t)1) * HZ; } static struct rpc_cred *callback_cred; @@ -659,7 +661,7 @@ static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses) { struct rpc_timeout timeparms = { - .to_initval = max_cb_time(), + .to_initval = max_cb_time(clp->net), .to_retries = 0, }; struct rpc_create_args args = { diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index fb98f291aac2..932b2ca6f203 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -51,7 +51,6 @@ #define NFSDDBG_FACILITY NFSDDBG_PROC /* Globals */ -time_t nfsd4_lease = 90; /* default lease time */ time_t nfsd4_grace = 90; #define all_ones {{~0,~0},~0} @@ -3184,7 +3183,7 @@ nfsd4_end_grace(struct nfsd_net *nn) * to see the (possibly new, possibly shorter) lease time, we * can safely set the next grace time to the current lease time: */ - nfsd4_grace = nfsd4_lease; + nfsd4_grace = nn->nfsd4_lease; } static time_t @@ -3194,9 +3193,9 @@ nfs4_laundromat(struct nfsd_net *nn) struct nfs4_openowner *oo; struct nfs4_delegation *dp; struct list_head *pos, *next, reaplist; - time_t cutoff = get_seconds() - nfsd4_lease; - time_t t, clientid_val = nfsd4_lease; - time_t u, 
test_val = nfsd4_lease; + time_t cutoff = get_seconds() - nn->nfsd4_lease; + time_t t, clientid_val = nn->nfsd4_lease; + time_t u, test_val = nn->nfsd4_lease; nfs4_lock_state(); @@ -3245,7 +3244,7 @@ nfs4_laundromat(struct nfsd_net *nn) dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); unhash_delegation(dp); } - test_val = nfsd4_lease; + test_val = nn->nfsd4_lease; list_for_each_safe(pos, next, &nn->close_lru) { oo = container_of(pos, struct nfs4_openowner, oo_close_lru); if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) { diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 250171c5c311..b775366a0a68 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -53,6 +53,7 @@ #include "vfs.h" #include "state.h" #include "cache.h" +#include "netns.h" #define NFSDDBG_FACILITY NFSDDBG_XDR @@ -2052,6 +2053,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, .mnt = exp->ex_path.mnt, .dentry = dentry, }; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1); BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion)); @@ -2212,7 +2214,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, if (bmval0 & FATTR4_WORD0_LEASE_TIME) { if ((buflen -= 4) < 0) goto out_resource; - WRITE32(nfsd4_lease); + WRITE32(nn->nfsd4_lease); } if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) { if ((buflen -= 4) < 0) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index f5ab74af6ce2..09d909a42ece 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -909,7 +909,8 @@ static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, time_ */ static ssize_t write_leasetime(struct file *file, char *buf, size_t size) { - return nfsd4_write_time(file, buf, size, &nfsd4_lease); + struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); + return nfsd4_write_time(file, buf, size, &nn->nfsd4_lease); } /** @@ -1060,6 +1061,7 @@ int nfsd_net_id; static __net_init int nfsd_init_net(struct net *net) { int retval; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); retval = nfsd_export_init(net); if (retval) @@ -1067,6 +1069,7 @@ static __net_init int nfsd_init_net(struct net *net) retval = nfsd_idmap_init(net); if (retval) goto out_idmap_error; + nn->nfsd4_lease = 90; /* default lease time */ return 0; out_idmap_error: diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index d7b210b735e1..a8f7325a9124 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -276,7 +276,6 @@ extern struct timeval nfssvc_boot; #ifdef CONFIG_NFSD_V4 -extern time_t nfsd4_lease; extern time_t nfsd4_grace; /* before processing a COMPOUND operation, we have to check that there -- cgit v1.2.1 From 5284b44e438580a50e8cc5189297a73a48a45ecb Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Tue, 27 Nov 2012 14:11:49 +0300 Subject: nfsd: make NFSv4 grace time per net Grace time is a part of NFSv4 state engine, which is constructed per network namespace. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. 
Bruce Fields --- fs/nfsd/netns.h | 1 + fs/nfsd/nfs4state.c | 9 +++------ fs/nfsd/nfsctl.c | 4 +++- fs/nfsd/nfsd.h | 2 -- 4 files changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 0c20be82cb01..2c4b2e2896dd 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -89,6 +89,7 @@ struct nfsd_net { bool in_grace; time_t nfsd4_lease; + time_t nfsd4_grace; }; extern int nfsd_net_id; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 932b2ca6f203..3db7617e6d39 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -50,9 +50,6 @@ #define NFSDDBG_FACILITY NFSDDBG_PROC -/* Globals */ -time_t nfsd4_grace = 90; - #define all_ones {{~0,~0},~0} static const stateid_t one_stateid = { .si_generation = ~0, @@ -3183,7 +3180,7 @@ nfsd4_end_grace(struct nfsd_net *nn) * to see the (possibly new, possibly shorter) lease time, we * can safely set the next grace time to the current lease time: */ - nfsd4_grace = nn->nfsd4_lease; + nn->nfsd4_grace = nn->nfsd4_lease; } static time_t @@ -4884,8 +4881,8 @@ nfs4_state_start_net(struct net *net) locks_start_grace(net, &nn->nfsd4_manager); nn->grace_ended = false; printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n", - nfsd4_grace, net); - queue_delayed_work(laundry_wq, &nn->laundromat_work, nfsd4_grace * HZ); + nn->nfsd4_grace, net); + queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ); return 0; } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 09d909a42ece..d902f83681e7 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -925,7 +925,8 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size) */ static ssize_t write_gracetime(struct file *file, char *buf, size_t size) { - return nfsd4_write_time(file, buf, size, &nfsd4_grace); + struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); + return nfsd4_write_time(file, buf, size, &nn->nfsd4_grace); } static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size) @@ -1070,6 +1071,7 @@ static __net_init int nfsd_init_net(struct net *net) if (retval) goto out_idmap_error; nn->nfsd4_lease = 90; /* default lease time */ + nn->nfsd4_grace = 90; return 0; out_idmap_error: diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index a8f7325a9124..5eea0f5021fd 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -276,8 +276,6 @@ extern struct timeval nfssvc_boot; #ifdef CONFIG_NFSD_V4 -extern time_t nfsd4_grace; - /* before processing a COMPOUND operation, we have to check that there * is enough space in the buffer for XDR encode to succeed. otherwise, * we might process an operation with side effects, and be unable to -- cgit v1.2.1 From c772aa92b6deb2857d4b39a5cc3bd3679cc5f4a6 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Wed, 28 Nov 2012 15:27:54 +0400 Subject: CIFS: Fix wrong buffer pointer usage in smb_set_file_info Commit 6bdf6dbd662176c0da5c3ac8ed10ac94e7776c85 caused a regression in setattr codepath that leads to files with wrong attributes. 
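Reduced to its essentials, the bug was that a local FILE_BASIC_INFO, no longer filled in after that commit, was sent to the server instead of the attributes the caller had prepared in buf (an illustration using the call as it appears in the diff below, not the complete function):

	FILE_BASIC_INFO info_buf;	/* uninitialized since the regressing commit */

	/* broken: pushes stack garbage to the server */
	rc = CIFSSMBSetFileInfo(xid, tcon, &info_buf, netfid, netpid);

	/* fixed: push the caller-supplied attributes */
	rc = CIFSSMBSetFileInfo(xid, tcon, buf, netfid, netpid);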
Signed-off-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/smb1ops.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 56cc4be87807..34cea2798333 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -766,7 +766,6 @@ smb_set_file_info(struct inode *inode, const char *full_path, struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink = NULL; struct cifs_tcon *tcon; - FILE_BASIC_INFO info_buf; /* if the file is already open for write, just use that fileid */ open_file = find_writable_file(cinode, true); @@ -817,7 +816,7 @@ smb_set_file_info(struct inode *inode, const char *full_path, netpid = current->tgid; set_via_filehandle: - rc = CIFSSMBSetFileInfo(xid, tcon, &info_buf, netfid, netpid); + rc = CIFSSMBSetFileInfo(xid, tcon, buf, netfid, netpid); if (!rc) cinode->cifsAttrs = le32_to_cpu(buf->Attributes); -- cgit v1.2.1 From e80d0a1ae8bb8fee0edd37427836f108b30f596b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 21 Nov 2012 16:26:44 +0100 Subject: cputime: Rename thread_group_times to thread_group_cputime_adjusted We have thread_group_cputime() and thread_group_times(). The naming doesn't provide enough information about the difference between these two APIs. To lower the confusion, rename thread_group_times() to thread_group_cputime_adjusted(). This name better suggests that it's a version of thread_group_cputime() that does some stabilization on the raw cputime values. ie here: scale on top of CFS runtime stats and bound lower value for monotonicity. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker --- fs/proc/array.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index c1c207c36cae..d3696708fc1a 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -438,7 +438,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, min_flt += sig->min_flt; maj_flt += sig->maj_flt; - thread_group_times(task, &utime, &stime); + thread_group_cputime_adjusted(task, &utime, &stime); gtime += sig->gtime; } @@ -454,7 +454,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, if (!whole) { min_flt = task->min_flt; maj_flt = task->maj_flt; - task_times(task, &utime, &stime); + task_cputime_adjusted(task, &utime, &stime); gtime = task->gtime; } -- cgit v1.2.1 From 91dd8c114499e9818f2d5919ef0b9eee61810220 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Wed, 28 Nov 2012 12:32:26 -0500 Subject: ext4: prevent race while walking extent tree for fiemap Currently ext4_ext_walk_space() only takes i_data_sem for read when searching for the extent at given block with ext4_ext_find_extent(). Then it drops the lock and the extent tree can be changed at will. However later on we're searching for the 'next' extent, but the extent tree might already have changed, so the information might not be accurate. In fact we can hit BUG_ON(end <= start) if the extent got inserted into the tree after the one we found and before the block we were searching for. This has been reproduced by running xfstests 225 in loop on s390x architecture, but theoretically we could hit this on any other architecture as well, but probably not as often. 
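The racy pattern, boiled down (a sketch of the old ext4_ext_walk_space() loop; the body that builds the fiemap record is abbreviated):

	down_read(&EXT4_I(inode)->i_data_sem);
	path = ext4_ext_find_extent(inode, block, path);
	up_read(&EXT4_I(inode)->i_data_sem);
	/* the extent tree may change from here on */
	next = ext4_ext_next_allocated_block(path);	/* possibly stale */
	/* ... compute start/end from the stale view ... */
	BUG_ON(end <= start);	/* trips if an extent was inserted meanwhile */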
Moreover the extent currently in delayed allocation might be allocated after we search the extent tree and before we search extent status tree delayed buffers resulting in those delayed buffers being completely missed, even though completely written and allocated. We fix all those problems in several steps: 1. remove unnecessary callback indirection 2. rename functions ext4_ext_walk_space -> ext4_fill_fiemap_extents ext4_ext_fiemap_cb -> ext4_find_delayed_extent 3. move fiemap_fill_next_extent() into ext4_fill_fiemap_extents() 4. hold the i_data_sem for: ext4_ext_find_extent() ext4_ext_next_allocated_block() ext4_find_delayed_extent() 5. call fiemap_fill_next_extent after releasing the i_data_sem 6. move path reinitialization into the critical section. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_extents.h | 14 ----- fs/ext4/extents.c | 136 +++++++++++++++++++++++++++---------------------- 2 files changed, 76 insertions(+), 74 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 603bb114735c..173b6c545323 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -143,20 +143,6 @@ struct ext4_ext_path { * structure for external API */ -/* - * to be called by ext4_ext_walk_space() - * negative retcode - error - * positive retcode - signal for ext4_ext_walk_space(), see below - * callback must return valid extent (passed or newly created) - */ -typedef int (*ext_prepare_callback)(struct inode *, ext4_lblk_t, - struct ext4_ext_cache *, - struct ext4_extent *, void *); - -#define EXT_CONTINUE 0 -#define EXT_BREAK 1 -#define EXT_REPEAT 2 - /* * Maximum number of logical blocks in a file; ext4_extent's ee_block is * __le32. diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d3dd6182c07a..fbe7dc284240 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -109,6 +109,9 @@ static int ext4_split_extent_at(handle_t *handle, int split_flag, int flags); +static int ext4_find_delayed_extent(struct inode *inode, + struct ext4_ext_cache *newex); + static int ext4_ext_truncate_extend_restart(handle_t *handle, struct inode *inode, int needed) @@ -1959,27 +1962,33 @@ cleanup: return err; } -static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, - ext4_lblk_t num, ext_prepare_callback func, - void *cbdata) +static int ext4_fill_fiemap_extents(struct inode *inode, + ext4_lblk_t block, ext4_lblk_t num, + struct fiemap_extent_info *fieinfo) { struct ext4_ext_path *path = NULL; struct ext4_ext_cache cbex; struct ext4_extent *ex; - ext4_lblk_t next, start = 0, end = 0; + ext4_lblk_t next, next_del, start = 0, end = 0; ext4_lblk_t last = block + num; - int depth, exists, err = 0; - - BUG_ON(func == NULL); - BUG_ON(inode == NULL); + int exists, depth = 0, err = 0; + unsigned int flags = 0; + unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; while (block < last && block != EXT_MAX_BLOCKS) { num = last - block; /* find extent for this block */ down_read(&EXT4_I(inode)->i_data_sem); + + if (path && ext_depth(inode) != depth) { + /* depth was changed. 
we have to realloc path */ + kfree(path); + path = NULL; + } + path = ext4_ext_find_extent(inode, block, path); - up_read(&EXT4_I(inode)->i_data_sem); if (IS_ERR(path)) { + up_read(&EXT4_I(inode)->i_data_sem); err = PTR_ERR(path); path = NULL; break; @@ -1987,13 +1996,16 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, depth = ext_depth(inode); if (unlikely(path[depth].p_hdr == NULL)) { + up_read(&EXT4_I(inode)->i_data_sem); EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); err = -EIO; break; } ex = path[depth].p_ext; next = ext4_ext_next_allocated_block(path); + ext4_ext_drop_refs(path); + flags = 0; exists = 0; if (!ex) { /* there is no extent yet, so try to allocate @@ -2037,30 +2049,54 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, cbex.ec_block = le32_to_cpu(ex->ee_block); cbex.ec_len = ext4_ext_get_actual_len(ex); cbex.ec_start = ext4_ext_pblock(ex); + if (ext4_ext_is_uninitialized(ex)) + flags |= FIEMAP_EXTENT_UNWRITTEN; } + /* + * Find delayed extent and update cbex accordingly. We call + * it even in !exists case to find out whether cbex is the + * last existing extent or not. + */ + next_del = ext4_find_delayed_extent(inode, &cbex); + if (!exists && next_del) { + exists = 1; + flags |= FIEMAP_EXTENT_DELALLOC; + } + up_read(&EXT4_I(inode)->i_data_sem); + if (unlikely(cbex.ec_len == 0)) { EXT4_ERROR_INODE(inode, "cbex.ec_len == 0"); err = -EIO; break; } - err = func(inode, next, &cbex, ex, cbdata); - ext4_ext_drop_refs(path); - - if (err < 0) - break; - if (err == EXT_REPEAT) - continue; - else if (err == EXT_BREAK) { - err = 0; - break; + /* This is possible iff next == next_del == EXT_MAX_BLOCKS */ + if (next == next_del) { + flags |= FIEMAP_EXTENT_LAST; + if (unlikely(next_del != EXT_MAX_BLOCKS || + next != EXT_MAX_BLOCKS)) { + EXT4_ERROR_INODE(inode, + "next extent == %u, next " + "delalloc extent = %u", + next, next_del); + err = -EIO; + break; + } } - if (ext_depth(inode) != depth) { - /* depth was changed. we have to realloc path */ - kfree(path); - path = NULL; + if (exists) { + err = fiemap_fill_next_extent(fieinfo, + (__u64)cbex.ec_block << blksize_bits, + (__u64)cbex.ec_start << blksize_bits, + (__u64)cbex.ec_len << blksize_bits, + flags); + if (err < 0) + break; + if (err == 1) { + err = 0; + break; + } } block = cbex.ec_block + cbex.ec_len; @@ -4493,26 +4529,23 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, } /* - * Callback function called for each extent to gather FIEMAP information. + * If newex is not existing extent (newex->ec_start equals zero) find + * delayed extent at start of newex and update newex accordingly and + * return start of the next delayed extent. + * + * If newex is existing extent (newex->ec_start is not equal zero) + * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed + * extent found. Leave newex unmodified. 
*/ -static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next, - struct ext4_ext_cache *newex, struct ext4_extent *ex, - void *data) +static int ext4_find_delayed_extent(struct inode *inode, + struct ext4_ext_cache *newex) { struct extent_status es; - __u64 logical; - __u64 physical; - __u64 length; - __u32 flags = 0; ext4_lblk_t next_del; - int ret = 0; - struct fiemap_extent_info *fieinfo = data; - unsigned char blksize_bits; es.start = newex->ec_block; next_del = ext4_es_find_extent(inode, &es); - next = min(next_del, next); if (newex->ec_start == 0) { /* * No extent in extent-tree contains block @newex->ec_start, @@ -4520,37 +4553,19 @@ static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next, */ if (es.len == 0) /* A hole found. */ - return EXT_CONTINUE; + return 0; if (es.start > newex->ec_block) { /* A hole found. */ newex->ec_len = min(es.start - newex->ec_block, newex->ec_len); - return EXT_CONTINUE; + return 0; } - flags |= FIEMAP_EXTENT_DELALLOC; newex->ec_len = es.start + es.len - newex->ec_block; } - if (ex && ext4_ext_is_uninitialized(ex)) - flags |= FIEMAP_EXTENT_UNWRITTEN; - - if (next == EXT_MAX_BLOCKS) - flags |= FIEMAP_EXTENT_LAST; - - blksize_bits = inode->i_sb->s_blocksize_bits; - logical = (__u64)newex->ec_block << blksize_bits; - physical = (__u64)newex->ec_start << blksize_bits; - length = (__u64)newex->ec_len << blksize_bits; - - ret = fiemap_fill_next_extent(fieinfo, logical, physical, - length, flags); - if (ret < 0) - return ret; - if (ret == 1) - return EXT_BREAK; - return EXT_CONTINUE; + return next_del; } /* fiemap flags we can handle specified here */ #define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) @@ -4772,6 +4787,7 @@ out_mutex: mutex_unlock(&inode->i_mutex); return err; } + int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { @@ -4799,11 +4815,11 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; /* - * Walk the extent tree gathering extent information. - * ext4_ext_fiemap_cb will push extents back to user. + * Walk the extent tree gathering extent information + * and pushing extents back to the user. */ - error = ext4_ext_walk_space(inode, start_blk, len_blks, - ext4_ext_fiemap_cb, fieinfo); + error = ext4_fill_fiemap_extents(inode, start_blk, + len_blks, fieinfo); } return error; -- cgit v1.2.1 From 06348679c9f69b3b031cf84c1f5f9f2488fc1f7d Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Wed, 28 Nov 2012 12:33:22 -0500 Subject: ext4: simple cleanup in fiemap codepath This commit is simple cleanup of fiemap codepath which has not been included in previous commit to make the changes clearer. 
In this commit we rename the cbex variable to newex in ext4_fill_fiemap_extents(), because the callback is no longer present Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index fbe7dc284240..56251466750c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1967,7 +1967,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode, struct fiemap_extent_info *fieinfo) { struct ext4_ext_path *path = NULL; - struct ext4_ext_cache cbex; + struct ext4_ext_cache newex; struct ext4_extent *ex; ext4_lblk_t next, next_del, start = 0, end = 0; ext4_lblk_t last = block + num; @@ -2042,31 +2042,31 @@ static int ext4_fill_fiemap_extents(struct inode *inode, BUG_ON(end <= start); if (!exists) { - cbex.ec_block = start; - cbex.ec_len = end - start; - cbex.ec_start = 0; + newex.ec_block = start; + newex.ec_len = end - start; + newex.ec_start = 0; } else { - cbex.ec_block = le32_to_cpu(ex->ee_block); - cbex.ec_len = ext4_ext_get_actual_len(ex); - cbex.ec_start = ext4_ext_pblock(ex); + newex.ec_block = le32_to_cpu(ex->ee_block); + newex.ec_len = ext4_ext_get_actual_len(ex); + newex.ec_start = ext4_ext_pblock(ex); if (ext4_ext_is_uninitialized(ex)) flags |= FIEMAP_EXTENT_UNWRITTEN; } /* - * Find delayed extent and update cbex accordingly. We call - * it even in !exists case to find out whether cbex is the + * Find delayed extent and update newex accordingly. We call + * it even in !exists case to find out whether newex is the * last existing extent or not. */ - next_del = ext4_find_delayed_extent(inode, &cbex); + next_del = ext4_find_delayed_extent(inode, &newex); if (!exists && next_del) { exists = 1; flags |= FIEMAP_EXTENT_DELALLOC; } up_read(&EXT4_I(inode)->i_data_sem); - if (unlikely(cbex.ec_len == 0)) { - EXT4_ERROR_INODE(inode, "cbex.ec_len == 0"); + if (unlikely(newex.ec_len == 0)) { + EXT4_ERROR_INODE(inode, "newex.ec_len == 0"); err = -EIO; break; } @@ -2087,9 +2087,9 @@ static int ext4_fill_fiemap_extents(struct inode *inode, if (exists) { err = fiemap_fill_next_extent(fieinfo, - (__u64)cbex.ec_block << blksize_bits, - (__u64)cbex.ec_start << blksize_bits, - (__u64)cbex.ec_len << blksize_bits, + (__u64)newex.ec_block << blksize_bits, + (__u64)newex.ec_start << blksize_bits, + (__u64)newex.ec_len << blksize_bits, flags); if (err < 0) break; if (err == 1) { err = 0; break; } } - block = cbex.ec_block + cbex.ec_len; + block = newex.ec_block + newex.ec_len; } if (path) { -- cgit v1.2.1 From 766f44d46a726cb59f52a75c5c87425a10c4bade Mon Sep 17 00:00:00 2001 From: Vahram Martirosyan Date: Wed, 28 Nov 2012 12:44:16 -0500 Subject: ext4: fixed potential NULL dereference in ext4_calculate_overhead() The memset operation before the NULL check can cause a BUG if the memory allocation failed. Since we are using get_zeroed_page, there is no need to use memset anyway. Found by the Spruce system in cooperation with the KEDR Framework. 
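The fix boils down to dropping the memset and keeping the NULL check first; roughly (a condensed view of the allocation in ext4_calculate_overhead()):

	char *buf = (char *) get_zeroed_page(GFP_KERNEL);

	if (!buf)	/* check before any use: memset(NULL, ...) would oops */
		return -ENOMEM;
	/* no memset() needed: get_zeroed_page() returns zero-filled memory */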
Signed-off-by: Vahram Martirosyan Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ad6cd8aeb946..66a4e20424cf 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3206,7 +3206,6 @@ int ext4_calculate_overhead(struct super_block *sb) ext4_fsblk_t overhead = 0; char *buf = (char *) get_zeroed_page(GFP_KERNEL); - memset(buf, 0, PAGE_SIZE); if (!buf) return -ENOMEM; -- cgit v1.2.1 From f3c7521fe53a7892d8c8c4715f7c0f4add7b2e19 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Tue, 27 Nov 2012 09:35:10 -0500 Subject: NFSD: Fold fault_inject.h into state.h There were only a small number of functions in this file and since they all affect stored state I think it makes sense to put them in state.h instead. I also dropped most static inline declarations since there are no callers when fault injection is not enabled. Signed-off-by: Bryan Schumaker Signed-off-by: J. Bruce Fields --- fs/nfsd/fault_inject.c | 1 - fs/nfsd/fault_inject.h | 28 ---------------------------- fs/nfsd/nfs4state.c | 1 - fs/nfsd/nfsctl.c | 2 +- fs/nfsd/state.h | 15 +++++++++++++++ 5 files changed, 16 insertions(+), 31 deletions(-) delete mode 100644 fs/nfsd/fault_inject.h (limited to 'fs') diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c index e6c38159622f..02781121c6b0 100644 --- a/fs/nfsd/fault_inject.c +++ b/fs/nfsd/fault_inject.c @@ -10,7 +10,6 @@ #include #include "state.h" -#include "fault_inject.h" struct nfsd_fault_inject_op { char *file; diff --git a/fs/nfsd/fault_inject.h b/fs/nfsd/fault_inject.h deleted file mode 100644 index 90bd0570956c..000000000000 --- a/fs/nfsd/fault_inject.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) 2011 Bryan Schumaker - * - * Function definitions for fault injection - */ - -#ifndef LINUX_NFSD_FAULT_INJECT_H -#define LINUX_NFSD_FAULT_INJECT_H - -#ifdef CONFIG_NFSD_FAULT_INJECTION -int nfsd_fault_inject_init(void); -void nfsd_fault_inject_cleanup(void); -void nfsd_forget_clients(u64); -void nfsd_forget_locks(u64); -void nfsd_forget_openowners(u64); -void nfsd_forget_delegations(u64); -void nfsd_recall_delegations(u64); -#else /* CONFIG_NFSD_FAULT_INJECTION */ -static inline int nfsd_fault_inject_init(void) { return 0; } -static inline void nfsd_fault_inject_cleanup(void) {} -static inline void nfsd_forget_clients(u64 num) {} -static inline void nfsd_forget_locks(u64 num) {} -static inline void nfsd_forget_openowners(u64 num) {} -static inline void nfsd_forget_delegations(u64 num) {} -static inline void nfsd_recall_delegations(u64 num) {} -#endif /* CONFIG_NFSD_FAULT_INJECTION */ - -#endif /* LINUX_NFSD_FAULT_INJECT_H */ diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3db7617e6d39..b1aa577dd869 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -44,7 +44,6 @@ #include "xdr4.h" #include "vfs.h" #include "current_stateid.h" -#include "fault_inject.h" #include "netns.h" diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index d902f83681e7..e13cbddcdbd0 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -19,7 +19,7 @@ #include "idmap.h" #include "nfsd.h" #include "cache.h" -#include "fault_inject.h" +#include "state.h" #include "netns.h" /* diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 2deb6a88e58e..b542bf2c0fe7 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -496,4 +496,19 @@ extern void nfsd4_client_record_create(struct nfs4_client *clp); extern void nfsd4_client_record_remove(struct nfs4_client *clp); extern int 
nfsd4_client_record_check(struct nfs4_client *clp); extern void nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time); + +/* nfs fault injection functions */ +#ifdef CONFIG_NFSD_FAULT_INJECTION +int nfsd_fault_inject_init(void); +void nfsd_fault_inject_cleanup(void); +void nfsd_forget_clients(u64); +void nfsd_forget_locks(u64); +void nfsd_forget_openowners(u64); +void nfsd_forget_delegations(u64); +void nfsd_recall_delegations(u64); +#else /* CONFIG_NFSD_FAULT_INJECTION */ +static inline int nfsd_fault_inject_init(void) { return 0; } +static inline void nfsd_fault_inject_cleanup(void) {} +#endif /* CONFIG_NFSD_FAULT_INJECTION */ + #endif /* NFSD4_STATE_H */ -- cgit v1.2.1 From 4a092d737955301da22b9d5e07f5036da821a932 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 28 Nov 2012 13:03:30 -0500 Subject: ext4: rationalize ext4_extents.h inclusion Previously, ext4_extents.h was being included at the end of ext4.h, which was bad for a number of reasons: (a) it was not being included in the expected place, and (b) it caused the header to be included multiple times. There were #ifdef's to prevent this from causing any problems, but it still was unnecessary. By moving the function declarations that were in ext4_extents.h to ext4.h, which is standard practice for where the function declarations for the rest of ext4.h can be found, we can remove ext4_extents.h from being included in ext4.h at all, and then we can only include ext4_extents.h where it is needed in ext4's source files. It should be possible to move a few more things into ext4.h, and further reduce the number of source files that need to #include ext4_extents.h, but that's a cleanup for another day. Reported-by: Sachin Kamat Reported-by: Wei Yongjun Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 34 ++++++++++++++++++++++++++++++++-- fs/ext4/ext4_extents.h | 25 ------------------------- fs/ext4/extents.c | 1 + fs/ext4/indirect.c | 1 + fs/ext4/migrate.c | 1 + fs/ext4/move_extent.c | 1 + fs/ext4/page-io.c | 1 - fs/ext4/super.c | 3 +-- 8 files changed, 37 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 246e38f3915a..2e9ffa9100bb 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -57,6 +57,16 @@ #define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif +/* + * Turn on EXT_DEBUG to get lots of info about extents operations. + */ +#define EXT_DEBUG__ +#ifdef EXT_DEBUG +#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) +#else +#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + #define EXT4_ERROR_INODE(inode, fmt, a...) 
\ ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) @@ -2399,6 +2409,9 @@ extern int ext4_check_blockref(const char *, unsigned int, struct inode *, __le32 *, unsigned int); /* extents.c */ +struct ext4_ext_path; +struct ext4_extent; + extern int ext4_ext_tree_init(handle_t *handle, struct inode *); extern int ext4_ext_writepage_trans_blocks(struct inode *, int); extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, @@ -2416,8 +2429,27 @@ extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, ssize_t len); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); +extern int ext4_ext_calc_metadata_amount(struct inode *inode, + ext4_lblk_t lblocks); +extern int ext4_extent_tree_init(handle_t *, struct inode *); +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); +extern int ext4_can_extents_be_merged(struct inode *inode, + struct ext4_extent *ex1, + struct ext4_extent *ex2); +extern int ext4_ext_insert_extent(handle_t *, struct inode *, + struct ext4_ext_path *, + struct ext4_extent *, int); +extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, + struct ext4_ext_path *); +extern void ext4_ext_drop_refs(struct ext4_ext_path *); +extern int ext4_ext_check_inode(struct inode *inode); +extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); + + /* move_extent.c */ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 start_orig, __u64 start_donor, @@ -2505,6 +2537,4 @@ extern void ext4_resize_end(struct super_block *sb); #endif /* __KERNEL__ */ -#include "ext4_extents.h" - #endif /* _EXT4_H */ diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 173b6c545323..487fda12bc00 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -42,16 +42,6 @@ */ #define CHECK_BINSEARCH__ -/* - * Turn on EXT_DEBUG to get lots of info about extents operations. - */ -#define EXT_DEBUG__ -#ifdef EXT_DEBUG -#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) -#else -#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) -#endif - /* * If EXT_STATS is defined then stats numbers are collected. * These number will be displayed at umount time. 
@@ -286,20 +276,5 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, 0xffff); } -extern int ext4_ext_calc_metadata_amount(struct inode *inode, - ext4_lblk_t lblocks); -extern int ext4_extent_tree_init(handle_t *, struct inode *); -extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, - int num, - struct ext4_ext_path *path); -extern int ext4_can_extents_be_merged(struct inode *inode, - struct ext4_extent *ex1, - struct ext4_extent *ex2); -extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int); -extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, - struct ext4_ext_path *); -extern void ext4_ext_drop_refs(struct ext4_ext_path *); -extern int ext4_ext_check_inode(struct inode *inode); -extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); #endif /* _EXT4_EXTENTS */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 56251466750c..1dc19a7b449f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -41,6 +41,7 @@ #include #include #include "ext4_jbd2.h" +#include "ext4_extents.h" #include diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index f6663c3a946d..20862f96e8ae 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -22,6 +22,7 @@ #include "ext4_jbd2.h" #include "truncate.h" +#include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */ #include diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index f1bb32ec0169..db8226d595fa 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -14,6 +14,7 @@ #include #include "ext4_jbd2.h" +#include "ext4_extents.h" /* * The contiguous blocks details which can be diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 292daeeed455..d9cc5ee42f53 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -18,6 +18,7 @@ #include #include "ext4_jbd2.h" #include "ext4.h" +#include "ext4_extents.h" /** * get_ext_path - Find an extent path for designated logical block number. diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 0fd16e653ebd..0016fbca2a40 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -27,7 +27,6 @@ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" -#include "ext4_extents.h" static struct kmem_cache *io_page_cachep, *io_end_cachep; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 66a4e20424cf..856206f255aa 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -45,12 +45,11 @@ #include #include "ext4.h" -#include "ext4_extents.h" +#include "ext4_extents.h" /* Needed for trace points definition */ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include "mballoc.h" -#include "ext4_extents.h" #define CREATE_TRACE_POINTS #include -- cgit v1.2.1 From c4144670fd9b34d6eae22c9f83751745898e8243 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 2 Oct 2012 16:34:38 -0400 Subject: kill daemonize() Signed-off-by: Al Viro --- fs/file.c | 6 ------ fs/fs_struct.c | 24 ------------------------ 2 files changed, 30 deletions(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index 7cb71b992603..7272a1c5831d 100644 --- a/fs/file.c +++ b/fs/file.c @@ -519,12 +519,6 @@ struct files_struct init_files = { .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), }; -void daemonize_descriptors(void) -{ - atomic_inc(&init_files.count); - reset_files_struct(&init_files); -} - /* * allocate a file descriptor, mark it busy. 
*/ diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 5df4775fea03..fe6ca583bbc0 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -164,27 +164,3 @@ struct fs_struct init_fs = { .seq = SEQCNT_ZERO, .umask = 0022, }; - -void daemonize_fs_struct(void) -{ - struct fs_struct *fs = current->fs; - - if (fs) { - int kill; - - task_lock(current); - - spin_lock(&init_fs.lock); - init_fs.users++; - spin_unlock(&init_fs.lock); - - spin_lock(&fs->lock); - current->fs = &init_fs; - kill = !--fs->users; - spin_unlock(&fs->lock); - - task_unlock(current); - if (kill) - free_fs_struct(fs); - } -} -- cgit v1.2.1 From d03d26e58fde2ec99478e26aab47b55755189b08 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 20 Oct 2012 21:46:25 -0400 Subject: make compat_do_execve() static, lose pt_regs argument Signed-off-by: Al Viro --- fs/exec.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 0039055b1fc6..f86b6cc2d6cc 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1575,10 +1575,9 @@ int do_execve(const char *filename, } #ifdef CONFIG_COMPAT -int compat_do_execve(const char *filename, +static int compat_do_execve(const char *filename, const compat_uptr_t __user *__argv, - const compat_uptr_t __user *__envp, - struct pt_regs *regs) + const compat_uptr_t __user *__envp) { struct user_arg_ptr argv = { .is_compat = true, @@ -1588,7 +1587,7 @@ int compat_do_execve(const char *filename, .is_compat = true, .ptr.compat = __envp, }; - return do_execve_common(filename, argv, envp, regs); + return do_execve_common(filename, argv, envp, current_pt_regs()); } #endif @@ -1682,8 +1681,7 @@ asmlinkage long compat_sys_execve(const char __user * filename, struct filename *path = getname(filename); int error = PTR_ERR(path); if (!IS_ERR(path)) { - error = compat_do_execve(path->name, argv, envp, - current_pt_regs()); + error = compat_do_execve(path->name, argv, envp); putname(path); } return error; -- cgit v1.2.1 From da3d4c5fa56236dd924d77ffc4f982356816b93b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 20 Oct 2012 21:49:33 -0400 Subject: get rid of pt_regs argument of do_execve() Signed-off-by: Al Viro --- fs/exec.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index f86b6cc2d6cc..5797ed07efd3 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1566,12 +1566,11 @@ out_ret: int do_execve(const char *filename, const char __user *const __user *__argv, - const char __user *const __user *__envp, - struct pt_regs *regs) + const char __user *const __user *__envp) { struct user_arg_ptr argv = { .ptr.native = __argv }; struct user_arg_ptr envp = { .ptr.native = __envp }; - return do_execve_common(filename, argv, envp, regs); + return do_execve_common(filename, argv, envp, current_pt_regs()); } #ifdef CONFIG_COMPAT @@ -1668,7 +1667,7 @@ SYSCALL_DEFINE3(execve, struct filename *path = getname(filename); int error = PTR_ERR(path); if (!IS_ERR(path)) { - error = do_execve(path->name, argv, envp, current_pt_regs()); + error = do_execve(path->name, argv, envp); putname(path); } return error; @@ -1694,12 +1693,9 @@ int kernel_execve(const char *filename, const char *const argv[], const char *const envp[]) { - struct pt_regs *p = current_pt_regs(); - int ret; - - ret = do_execve(filename, + int ret = do_execve(filename, (const char __user *const __user *)argv, - (const char __user *const __user *)envp, p); + (const char __user *const __user *)envp); if (ret < 0) return ret; @@ -1707,6 +1703,6 @@ int 
kernel_execve(const char *filename, * We were successful. We won't be returning to our caller, but * instead to user space by manipulating the kernel stack. */ - ret_from_kernel_execve(p); + ret_from_kernel_execve(current_pt_regs()); } #endif -- cgit v1.2.1 From 835ab32dff6b437e74c266468b83c4abb69041dc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 20 Oct 2012 21:50:59 -0400 Subject: get rid of pt_regs argument of do_execve_common() Signed-off-by: Al Viro --- fs/exec.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 5797ed07efd3..dc5e2830d353 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1439,8 +1439,7 @@ EXPORT_SYMBOL(search_binary_handler); */ static int do_execve_common(const char *filename, struct user_arg_ptr argv, - struct user_arg_ptr envp, - struct pt_regs *regs) + struct user_arg_ptr envp) { struct linux_binprm *bprm; struct file *file; @@ -1448,6 +1447,7 @@ static int do_execve_common(const char *filename, bool clear_in_exec; int retval; const struct cred *cred = current_cred(); + struct pt_regs *regs = current_pt_regs(); /* * We move the actual failure in case of RLIMIT_NPROC excess from @@ -1570,7 +1570,7 @@ int do_execve(const char *filename, { struct user_arg_ptr argv = { .ptr.native = __argv }; struct user_arg_ptr envp = { .ptr.native = __envp }; - return do_execve_common(filename, argv, envp, current_pt_regs()); + return do_execve_common(filename, argv, envp); } #ifdef CONFIG_COMPAT @@ -1586,7 +1586,7 @@ static int compat_do_execve(const char *filename, .is_compat = true, .ptr.compat = __envp, }; - return do_execve_common(filename, argv, envp, current_pt_regs()); + return do_execve_common(filename, argv, envp); } #endif -- cgit v1.2.1 From 3c456bfc4ba66e9cda210da7bc4fb0ba9fcc6972 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 20 Oct 2012 21:53:31 -0400 Subject: get rid of pt_regs argument of search_binary_handler() Signed-off-by: Al Viro --- fs/binfmt_em86.c | 2 +- fs/binfmt_misc.c | 2 +- fs/binfmt_script.c | 2 +- fs/exec.c | 7 +++---- 4 files changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c index 2790c7e1912e..7e125718a75e 100644 --- a/fs/binfmt_em86.c +++ b/fs/binfmt_em86.c @@ -90,7 +90,7 @@ static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs) if (retval < 0) return retval; - return search_binary_handler(bprm, regs); + return search_binary_handler(bprm); } static struct linux_binfmt em86_format = { diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 790b3cddca67..226aeac22ac9 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -199,7 +199,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) bprm->recursion_depth++; - retval = search_binary_handler (bprm, regs); + retval = search_binary_handler(bprm); if (retval < 0) goto _error; diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index d3b8c1f63155..798b729f01d5 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -95,7 +95,7 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs) retval = prepare_binprm(bprm); if (retval < 0) return retval; - return search_binary_handler(bprm,regs); + return search_binary_handler(bprm); } static struct linux_binfmt script_format = { diff --git a/fs/exec.c b/fs/exec.c index dc5e2830d353..2aee7ef10663 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1349,7 +1349,7 @@ EXPORT_SYMBOL(remove_arg_zero); /* * cycle the list of binary formats handler, until one recognizes the image */ -int 
search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) +int search_binary_handler(struct linux_binprm *bprm) { unsigned int depth = bprm->recursion_depth; int try,retval; @@ -1380,7 +1380,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); - retval = fn(bprm, regs); + retval = fn(bprm, current_pt_regs()); /* * Restore the depth counter to its starting value * in this call, so we don't have to rely on every @@ -1447,7 +1447,6 @@ static int do_execve_common(const char *filename, bool clear_in_exec; int retval; const struct cred *cred = current_cred(); - struct pt_regs *regs = current_pt_regs(); /* * We move the actual failure in case of RLIMIT_NPROC excess from @@ -1524,7 +1523,7 @@ static int do_execve_common(const char *filename, if (retval < 0) goto out; - retval = search_binary_handler(bprm,regs); + retval = search_binary_handler(bprm); if (retval < 0) goto out; -- cgit v1.2.1 From 71613c3b871c5a9f27cc48f124251bcd3aa23be1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 20 Oct 2012 22:00:48 -0400 Subject: get rid of pt_regs argument of ->load_binary() Signed-off-by: Al Viro --- fs/binfmt_aout.c | 5 +++-- fs/binfmt_elf.c | 5 +++-- fs/binfmt_elf_fdpic.c | 6 +++--- fs/binfmt_em86.c | 2 +- fs/binfmt_flat.c | 5 +++-- fs/binfmt_misc.c | 2 +- fs/binfmt_script.c | 2 +- fs/binfmt_som.c | 5 +++-- fs/exec.c | 4 ++-- 9 files changed, 20 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 0e7a6f81ae36..6043567b95c2 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -30,7 +30,7 @@ #include #include -static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs); +static int load_aout_binary(struct linux_binprm *); static int load_aout_library(struct file*); #ifdef CONFIG_COREDUMP @@ -201,8 +201,9 @@ static unsigned long __user *create_aout_tables(char __user *p, struct linux_bin * libraries. There is no binary dependent code anywhere else. 
*/ -static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) +static int load_aout_binary(struct linux_binprm * bprm) { + struct pt_regs *regs = current_pt_regs(); struct exec ex; unsigned long error; unsigned long fd_offset; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index fbd9f60bd763..6d7d1647a68c 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -44,7 +44,7 @@ #define user_siginfo_t siginfo_t #endif -static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs); +static int load_elf_binary(struct linux_binprm *bprm); static int load_elf_library(struct file *); static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *, int, int, unsigned long); @@ -558,7 +558,7 @@ static unsigned long randomize_stack_top(unsigned long stack_top) #endif } -static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) +static int load_elf_binary(struct linux_binprm *bprm) { struct file *interpreter = NULL; /* to shut gcc up */ unsigned long load_addr = 0, load_bias = 0; @@ -575,6 +575,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) unsigned long reloc_func_desc __maybe_unused = 0; int executable_stack = EXSTACK_DEFAULT; unsigned long def_flags = 0; + struct pt_regs *regs = current_pt_regs(); struct { struct elfhdr elf_ex; struct elfhdr interp_elf_ex; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index a46049154107..dc84732e554f 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -56,7 +56,7 @@ typedef char *elf_caddr_t; MODULE_LICENSE("GPL"); -static int load_elf_fdpic_binary(struct linux_binprm *, struct pt_regs *); +static int load_elf_fdpic_binary(struct linux_binprm *); static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *, struct file *); static int elf_fdpic_map_file(struct elf_fdpic_params *, struct file *, struct mm_struct *, const char *); @@ -164,10 +164,10 @@ static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, /* * load an fdpic binary into various bits of memory */ -static int load_elf_fdpic_binary(struct linux_binprm *bprm, - struct pt_regs *regs) +static int load_elf_fdpic_binary(struct linux_binprm *bprm) { struct elf_fdpic_params exec_params, interp_params; + struct pt_regs *regs = current_pt_regs(); struct elf_phdr *phdr; unsigned long stack_size, entryaddr; #ifdef ELF_FDPIC_PLAT_INIT diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c index 7e125718a75e..4e6cce57d113 100644 --- a/fs/binfmt_em86.c +++ b/fs/binfmt_em86.c @@ -22,7 +22,7 @@ #define EM86_INTERP "/usr/bin/em86" #define EM86_I_NAME "em86" -static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs) +static int load_em86(struct linux_binprm *bprm) { char *interp, *i_name, *i_arg; struct file * file; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index e280352b28f9..b56371981d16 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -88,7 +88,7 @@ struct lib_info { static int load_flat_shared_library(int id, struct lib_info *p); #endif -static int load_flat_binary(struct linux_binprm *, struct pt_regs * regs); +static int load_flat_binary(struct linux_binprm *); static int flat_core_dump(struct coredump_params *cprm); static struct linux_binfmt flat_format = { @@ -858,9 +858,10 @@ out: * libraries. There is no binary dependent code anywhere else. 
*/ -static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs) +static int load_flat_binary(struct linux_binprm * bprm) { struct lib_info libinfo; + struct pt_regs *regs = current_pt_regs(); unsigned long p = bprm->p; unsigned long stack_len; unsigned long start_addr; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 226aeac22ac9..b0b70fbea06c 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -104,7 +104,7 @@ static Node *check_file(struct linux_binprm *bprm) /* * the loader itself */ -static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) +static int load_misc_binary(struct linux_binprm *bprm) { Node *fmt; struct file * interp_file = NULL; diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index 798b729f01d5..8c954997e7f7 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -14,7 +14,7 @@ #include #include -static int load_script(struct linux_binprm *bprm,struct pt_regs *regs) +static int load_script(struct linux_binprm *bprm) { const char *i_arg, *i_name; char *cp; diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index 4517aaff61b4..4e00ed68d4a6 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c @@ -35,7 +35,7 @@ #include -static int load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs); +static int load_som_binary(struct linux_binprm * bprm); static int load_som_library(struct file *); /* @@ -180,13 +180,14 @@ out: */ static int -load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) +load_som_binary(struct linux_binprm * bprm) { int retval; unsigned int size; unsigned long som_entry; struct som_hdr *som_ex; struct som_exec_auxhdr *hpuxhdr; + struct pt_regs *regs = current_pt_regs(); /* Get the exec-header */ som_ex = (struct som_hdr *) bprm->buf; diff --git a/fs/exec.c b/fs/exec.c index 2aee7ef10663..721a29929511 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1374,13 +1374,13 @@ int search_binary_handler(struct linux_binprm *bprm) for (try=0; try<2; try++) { read_lock(&binfmt_lock); list_for_each_entry(fmt, &formats, lh) { - int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary; + int (*fn)(struct linux_binprm *) = fmt->load_binary; if (!fn) continue; if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); - retval = fn(bprm, current_pt_regs()); + retval = fn(bprm); /* * Restore the depth counter to its starting value * in this call, so we don't have to rely on every -- cgit v1.2.1 From 541880d9a2c7871f6370071d55aa6662d329c51e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 5 Nov 2012 13:11:26 -0500 Subject: do_coredump(): get rid of pt_regs argument Signed-off-by: Al Viro --- fs/coredump.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/coredump.c b/fs/coredump.c index ce47379bfa61..177493272a61 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -458,7 +458,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) return err; } -void do_coredump(siginfo_t *siginfo, struct pt_regs *regs) +void do_coredump(siginfo_t *siginfo) { struct core_state core_state; struct core_name cn; @@ -474,7 +474,7 @@ void do_coredump(siginfo_t *siginfo, struct pt_regs *regs) static atomic_t core_dump_count = ATOMIC_INIT(0); struct coredump_params cprm = { .siginfo = siginfo, - .regs = regs, + .regs = signal_pt_regs(), .limit = rlimit(RLIMIT_CORE), /* * We must use the same mm->flags while dumping core to avoid -- cgit v1.2.1 From 45bce8f3e3436bbe2e03dd2b076abdce79ffabb7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 29 Nov 
2012 10:21:43 -0800 Subject: fs/buffer.c: make block-size be per-page and protected by the page lock This makes the buffer size handling be a per-page thing, which allows us to not have to worry about locking too much when changing the buffer size. If a page doesn't have buffers, we still need to read the block size from the inode, but we can do that with ACCESS_ONCE(), so that even if the size is changing, we get a consistent value. This doesn't convert all functions - many of the buffer functions are used purely by filesystems, which in turn results in the buffer size being fixed at mount-time. So they don't have the same consistency issues that the raw device access can have. Signed-off-by: Linus Torvalds --- fs/buffer.c | 79 +++++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 48 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index b5f044283edb..28a74ff5324b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1552,6 +1552,28 @@ void unmap_underlying_metadata(struct block_device *bdev, sector_t block) } EXPORT_SYMBOL(unmap_underlying_metadata); +/* + * Size is a power-of-two in the range 512..PAGE_SIZE, + * and the case we care about most is PAGE_SIZE. + * + * So this *could* possibly be written with those + * constraints in mind (relevant mostly if some + * architecture has a slow bit-scan instruction) + */ +static inline int block_size_bits(unsigned int blocksize) +{ + return ilog2(blocksize); +} + +static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state) +{ + BUG_ON(!PageLocked(page)); + + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state); + return page_buffers(page); +} + /* * NOTE! All mapped/uptodate combinations are valid: * @@ -1589,19 +1611,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page, sector_t block; sector_t last_block; struct buffer_head *bh, *head; - const unsigned blocksize = 1 << inode->i_blkbits; + unsigned int blocksize, bbits; int nr_underway = 0; int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); - BUG_ON(!PageLocked(page)); - - last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; - - if (!page_has_buffers(page)) { - create_empty_buffers(page, blocksize, + head = create_page_buffers(page, inode, (1 << BH_Dirty)|(1 << BH_Uptodate)); - } /* * Be very careful. We have no exclusion from __set_page_dirty_buffers @@ -1613,9 +1629,12 @@ static int __block_write_full_page(struct inode *inode, struct page *page, * handle that here by just cleaning them. 
*/ - block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - head = page_buffers(page); bh = head; + blocksize = bh->b_size; + bbits = block_size_bits(blocksize); + + block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); + last_block = (i_size_read(inode) - 1) >> bbits; /* * Get all the dirty buffers mapped to disk addresses and @@ -1806,12 +1825,10 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len, BUG_ON(to > PAGE_CACHE_SIZE); BUG_ON(from > to); - blocksize = 1 << inode->i_blkbits; - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); - head = page_buffers(page); + head = create_page_buffers(page, inode, 0); + blocksize = head->b_size; + bbits = block_size_bits(blocksize); - bbits = inode->i_blkbits; block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); for(bh = head, block_start = 0; bh != head || !block_start; @@ -1881,11 +1898,11 @@ static int __block_commit_write(struct inode *inode, struct page *page, unsigned blocksize; struct buffer_head *bh, *head; - blocksize = 1 << inode->i_blkbits; + bh = head = page_buffers(page); + blocksize = bh->b_size; - for(bh = head = page_buffers(page), block_start = 0; - bh != head || !block_start; - block_start=block_end, bh = bh->b_this_page) { + block_start = 0; + do { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (!buffer_uptodate(bh)) @@ -1895,7 +1912,10 @@ static int __block_commit_write(struct inode *inode, struct page *page, mark_buffer_dirty(bh); } clear_buffer_new(bh); - } + + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); /* * If this is a partial write which happened to make all buffers @@ -2020,7 +2040,6 @@ EXPORT_SYMBOL(generic_write_end); int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, unsigned long from) { - struct inode *inode = page->mapping->host; unsigned block_start, block_end, blocksize; unsigned to; struct buffer_head *bh, *head; @@ -2029,13 +2048,13 @@ int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, if (!page_has_buffers(page)) return 0; - blocksize = 1 << inode->i_blkbits; + head = page_buffers(page); + blocksize = head->b_size; to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count); to = from + to; if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) return 0; - head = page_buffers(page); bh = head; block_start = 0; do { @@ -2068,18 +2087,16 @@ int block_read_full_page(struct page *page, get_block_t *get_block) struct inode *inode = page->mapping->host; sector_t iblock, lblock; struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; - unsigned int blocksize; + unsigned int blocksize, bbits; int nr, i; int fully_mapped = 1; - BUG_ON(!PageLocked(page)); - blocksize = 1 << inode->i_blkbits; - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); - head = page_buffers(page); + head = create_page_buffers(page, inode, 0); + blocksize = head->b_size; + bbits = block_size_bits(blocksize); - iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits; + iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); + lblock = (i_size_read(inode)+blocksize-1) >> bbits; bh = head; nr = 0; i = 0; -- cgit v1.2.1 From 1e8b33328a5407b447ff80953655a47014a6dcb9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 29 Nov 2012 10:49:50 -0800 Subject: blockdev: remove bd_block_size_semaphore again This reverts the block-device direct access code to 
the previous unlocked code, now that fs/buffer.c no longer needs external locking. With this, fs/block_dev.c is back to the original version, apart from a whitespace cleanup that I didn't want to revert. Signed-off-by: Linus Torvalds --- fs/block_dev.c | 105 +++------------------------------------------------------ 1 file changed, 4 insertions(+), 101 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 1a1e5e3b1eaf..47a949d8a07e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -116,8 +116,6 @@ EXPORT_SYMBOL(invalidate_bdev); int set_blocksize(struct block_device *bdev, int size) { - struct address_space *mapping; - /* Size must be a power of two, and between 512 and PAGE_SIZE */ if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) return -EINVAL; @@ -126,19 +124,6 @@ int set_blocksize(struct block_device *bdev, int size) if (size < bdev_logical_block_size(bdev)) return -EINVAL; - /* Prevent starting I/O or mapping the device */ - percpu_down_write(&bdev->bd_block_size_semaphore); - - /* Check that the block device is not memory mapped */ - mapping = bdev->bd_inode->i_mapping; - mutex_lock(&mapping->i_mmap_mutex); - if (mapping_mapped(mapping)) { - mutex_unlock(&mapping->i_mmap_mutex); - percpu_up_write(&bdev->bd_block_size_semaphore); - return -EBUSY; - } - mutex_unlock(&mapping->i_mmap_mutex); - /* Don't change the size if it is same as current */ if (bdev->bd_block_size != size) { sync_blockdev(bdev); @@ -146,9 +131,6 @@ int set_blocksize(struct block_device *bdev, int size) bdev->bd_inode->i_blkbits = blksize_bits(size); kill_bdev(bdev); } - - percpu_up_write(&bdev->bd_block_size_semaphore); - return 0; } @@ -459,12 +441,6 @@ static struct inode *bdev_alloc_inode(struct super_block *sb) struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); if (!ei) return NULL; - - if (unlikely(percpu_init_rwsem(&ei->bdev.bd_block_size_semaphore))) { - kmem_cache_free(bdev_cachep, ei); - return NULL; - } - return &ei->vfs_inode; } @@ -473,8 +449,6 @@ static void bdev_i_callback(struct rcu_head *head) struct inode *inode = container_of(head, struct inode, i_rcu); struct bdev_inode *bdi = BDEV_I(inode); - percpu_free_rwsem(&bdi->bdev.bd_block_size_semaphore); - kmem_cache_free(bdev_cachep, bdi); } @@ -1593,22 +1567,6 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) return blkdev_ioctl(bdev, mode, cmd, arg); } -ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - ssize_t ret; - struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); - - percpu_down_read(&bdev->bd_block_size_semaphore); - - ret = generic_file_aio_read(iocb, iov, nr_segs, pos); - - percpu_up_read(&bdev->bd_block_size_semaphore); - - return ret; -} -EXPORT_SYMBOL_GPL(blkdev_aio_read); - /* * Write data to the block device. Only intended for the block device itself * and the raw driver which basically is a fake block device. 
@@ -1620,16 +1578,12 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; - struct block_device *bdev = I_BDEV(file->f_mapping->host); struct blk_plug plug; ssize_t ret; BUG_ON(iocb->ki_pos != pos); blk_start_plug(&plug); - - percpu_down_read(&bdev->bd_block_size_semaphore); - ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); if (ret > 0 || ret == -EIOCBQUEUED) { ssize_t err; @@ -1638,62 +1592,11 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err < 0 && ret > 0) ret = err; } - - percpu_up_read(&bdev->bd_block_size_semaphore); - blk_finish_plug(&plug); - return ret; } EXPORT_SYMBOL_GPL(blkdev_aio_write); -static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) -{ - int ret; - struct block_device *bdev = I_BDEV(file->f_mapping->host); - - percpu_down_read(&bdev->bd_block_size_semaphore); - - ret = generic_file_mmap(file, vma); - - percpu_up_read(&bdev->bd_block_size_semaphore); - - return ret; -} - -static ssize_t blkdev_splice_read(struct file *file, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) -{ - ssize_t ret; - struct block_device *bdev = I_BDEV(file->f_mapping->host); - - percpu_down_read(&bdev->bd_block_size_semaphore); - - ret = generic_file_splice_read(file, ppos, pipe, len, flags); - - percpu_up_read(&bdev->bd_block_size_semaphore); - - return ret; -} - -static ssize_t blkdev_splice_write(struct pipe_inode_info *pipe, - struct file *file, loff_t *ppos, size_t len, - unsigned int flags) -{ - ssize_t ret; - struct block_device *bdev = I_BDEV(file->f_mapping->host); - - percpu_down_read(&bdev->bd_block_size_semaphore); - - ret = generic_file_splice_write(pipe, file, ppos, len, flags); - - percpu_up_read(&bdev->bd_block_size_semaphore); - - return ret; -} - - /* * Try to release a page associated with block device when the system * is under memory pressure. @@ -1724,16 +1627,16 @@ const struct file_operations def_blk_fops = { .llseek = block_llseek, .read = do_sync_read, .write = do_sync_write, - .aio_read = blkdev_aio_read, + .aio_read = generic_file_aio_read, .aio_write = blkdev_aio_write, - .mmap = blkdev_mmap, + .mmap = generic_file_mmap, .fsync = blkdev_fsync, .unlocked_ioctl = block_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_blkdev_ioctl, #endif - .splice_read = blkdev_splice_read, - .splice_write = blkdev_splice_write, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, }; int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) -- cgit v1.2.1 From ef9d873344ff9f5084eacb9f3735982314dfda9e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 29 Nov 2012 15:26:33 +1100 Subject: xfs: byte range granularity for XFS_IOC_ZERO_RANGE XFS_IOC_ZERO_RANGE simply does not work properly for non-page-cache-aligned ranges. Neither test 242 nor 290 exercises this correctly, so the behaviour is completely busted even though the tests pass. Fix it to support full byte range granularity as was originally intended for this ioctl.
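The fix (in the diff that follows) rounds the requested range inwards to the zeroing granularity, converts the whole blocks in the interior to preallocated extents, and zeroes the unaligned head and tail by hand. A rough userspace sketch of just that boundary arithmetic, with made-up offset, length and granularity values, and ROUND_UP/ROUND_DOWN standing in for the kernel's round_up()/round_down() helpers:

#include <stdio.h>
#include <inttypes.h>

#define ROUND_UP(x, g)   ((((x) + (g) - 1) / (g)) * (g))
#define ROUND_DOWN(x, g) (((x) / (g)) * (g))

int main(void)
{
	uint64_t granularity = 4096;          /* max(block size, page size) */
	uint64_t offset = 1000, len = 20000;  /* hypothetical ioctl arguments */

	/* Round inwards: only whole blocks strictly between the two
	 * boundaries can be converted to unwritten/preallocated extents. */
	uint64_t start = ROUND_UP(offset, granularity);
	uint64_t end = ROUND_DOWN(offset + len, granularity);

	if (start < end) {
		printf("convert blocks  [%" PRIu64 ", %" PRIu64 ")\n", start, end);
		if (start != offset)
			printf("zero head bytes [%" PRIu64 ", %" PRIu64 ")\n", offset, start);
		if (end != offset + len)
			printf("zero tail bytes [%" PRIu64 ", %" PRIu64 ")\n", end, offset + len);
	} else {
		/* Sub-granularity range: just zero the bytes directly. */
		printf("zero bytes      [%" PRIu64 ", %" PRIu64 ")\n", offset, offset + len);
	}
	return 0;
}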
Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_file.c | 2 +- fs/xfs/xfs_vnodeops.c | 96 ++++++++++++++++++++++++++++++++++++++++----------- fs/xfs/xfs_vnodeops.h | 1 + 3 files changed, 77 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 400b187595bb..67284edb84d7 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -86,7 +86,7 @@ xfs_rw_ilock_demote( * valid before the operation, it will be read from disk before * being partially zeroed. */ -STATIC int +int xfs_iozero( struct xfs_inode *ip, /* inode */ loff_t pos, /* offset in file */ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 26880793feca..d95f565a390e 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2095,6 +2095,73 @@ xfs_free_file_space( return error; } + +STATIC int +xfs_zero_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len, + int attr_flags) +{ + struct xfs_mount *mp = ip->i_mount; + uint granularity; + xfs_off_t start_boundary; + xfs_off_t end_boundary; + int error; + + granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); + + /* + * Round the range of extents we are going to convert inwards. If the + * offset is aligned, then it doesn't get changed so we zero from the + * start of the block offset points to. + */ + start_boundary = round_up(offset, granularity); + end_boundary = round_down(offset + len, granularity); + + ASSERT(start_boundary >= offset); + ASSERT(end_boundary <= offset + len); + + if (!(attr_flags & XFS_ATTR_NOLOCK)) + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + if (start_boundary < end_boundary - 1) { + /* punch out the page cache over the conversion range */ + truncate_pagecache_range(VFS_I(ip), start_boundary, + end_boundary - 1); + /* convert the blocks */ + error = xfs_alloc_file_space(ip, start_boundary, + end_boundary - start_boundary - 1, + XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT, + attr_flags); + if (error) + goto out_unlock; + + /* We've handled the interior of the range, now for the edges */ + if (start_boundary != offset) + error = xfs_iozero(ip, offset, start_boundary - offset); + if (error) + goto out_unlock; + + if (end_boundary != offset + len) + error = xfs_iozero(ip, end_boundary, + offset + len - end_boundary); + + } else { + /* + * It's either a sub-granularity range or the range spanned lies + * partially across two adjacent blocks. + */ + error = xfs_iozero(ip, offset, len); + } + +out_unlock: + if (!(attr_flags & XFS_ATTR_NOLOCK)) + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; + +} + /* * xfs_change_file_space() * This routine allocates or frees disk space for the given file. @@ -2120,10 +2187,8 @@ xfs_change_file_space( xfs_fsize_t fsize; int setprealloc; xfs_off_t startoffset; - xfs_off_t end; xfs_trans_t *tp; struct iattr iattr; - int prealloc_type; if (!S_ISREG(ip->i_d.di_mode)) return XFS_ERROR(EINVAL); @@ -2172,31 +2237,20 @@ xfs_change_file_space( startoffset = bf->l_start; fsize = XFS_ISIZE(ip); - /* - * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve - * file space. - * These calls do NOT zero the data space allocated to the file, - * nor do they change the file size. - * - * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file - * space. - * These calls cause the new file data to be zeroed and the file - * size to be changed. 
- */ setprealloc = clrprealloc = 0; - prealloc_type = XFS_BMAPI_PREALLOC; - switch (cmd) { case XFS_IOC_ZERO_RANGE: - prealloc_type |= XFS_BMAPI_CONVERT; - end = round_down(startoffset + bf->l_len, PAGE_SIZE) - 1; - if (startoffset <= end) - truncate_pagecache_range(VFS_I(ip), startoffset, end); - /* FALLTHRU */ + error = xfs_zero_file_space(ip, startoffset, bf->l_len, + attr_flags); + if (error) + return error; + setprealloc = 1; + break; + case XFS_IOC_RESVSP: case XFS_IOC_RESVSP64: error = xfs_alloc_file_space(ip, startoffset, bf->l_len, - prealloc_type, attr_flags); + XFS_BMAPI_PREALLOC, attr_flags); if (error) return error; setprealloc = 1; diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 91a03fa3814f..5163022d9808 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -49,6 +49,7 @@ int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); +int xfs_iozero(struct xfs_inode *, loff_t, size_t); int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool); -- cgit v1.2.1 From 437a255aa23766666aec78af63be4c253faa8d57 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 28 Nov 2012 13:01:00 +1100 Subject: xfs: fix direct IO nested transaction deadlock. The direct IO path can do a nested transaction reservation when writing past the EOF. The first transaction is the append transaction for setting the filesize at IO completion, but we can also need a transaction for allocation of blocks. If the log is low on space due to reservations and a small log, the append transaction can be granted after waiting for space as the only active transaction in the system. This then attempts a reservation for an allocation, which there isn't space in the log for, and the reservation sleeps. The result is that there is nothing left in the system to wake up all the processes waiting for log space to come free. The stack trace that shows this deadlock is relatively innocuous: xlog_grant_head_wait xlog_grant_head_check xfs_log_reserve xfs_trans_reserve xfs_iomap_write_direct __xfs_get_blocks xfs_get_blocks_direct do_blockdev_direct_IO __blockdev_direct_IO xfs_vm_direct_IO generic_file_direct_write xfs_file_dio_aio_write xfs_file_aio_write do_sync_write vfs_write This was discovered on a filesystem with a log of only 10MB, and a log stripe unit of 256k which increased the base reservations by 512k. Hence an allocation transaction requires 1.2MB of log space to be available instead of only 260k, and so greatly increased the chance that there wouldn't be enough log space available for the nested transaction to succeed. The key to reproducing it is this mkfs command: mkfs.xfs -f -d agcount=16,su=256k,sw=12 -l su=256k,size=2560b $SCRATCH_DEV The test case was 1000 fsstress processes running with random freezes and unfreezes every few seconds. Thanks to Eryu Guan (eguan@redhat.com) for writing the test that found this on a system with a somewhat unique default configuration....
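The shape of the deadlock can be modelled outside the kernel: a fixed log-space budget, an append reservation that is granted first, and a nested allocation reservation that cannot fit in what remains. The numbers below are invented for illustration, and this is a single-threaded toy, not XFS code:

#include <stdbool.h>
#include <stdio.h>

static long free_space = 1400;	/* what is left of a small, busy log */

static bool reserve(const char *what, long need)
{
	if (need > free_space) {
		printf("%s needs %ld but only %ld is free: sleeps\n",
		       what, need, free_space);
		return false;
	}
	free_space -= need;
	printf("%s granted %ld, %ld left\n", what, need, free_space);
	return true;
}

int main(void)
{
	/* The append transaction for the size update is granted first... */
	reserve("append transaction", 300);

	/* ...then the nested allocation reservation cannot be satisfied,
	 * and the append transaction it is nested inside can never
	 * complete to release its space: nothing will ever wake us. */
	if (!reserve("allocation transaction", 1200))
		printf("deadlock: no other transaction can free log space\n");
	return 0;
}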
cc: Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Andrew Dahl Signed-off-by: Ben Myers --- fs/xfs/xfs_aops.c | 81 ++++++++++++++++++++----------------------------------- fs/xfs/xfs_log.c | 3 ++- 2 files changed, 31 insertions(+), 53 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 71361da1f77c..4111a40ebe1a 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -124,7 +124,7 @@ xfs_setfilesize_trans_alloc( ioend->io_append_trans = tp; /* - * We will pass freeze protection with a transaction. So tell lockdep + * We may pass freeze protection with a transaction. So tell lockdep * we released it. */ rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], @@ -149,11 +149,13 @@ xfs_setfilesize( xfs_fsize_t isize; /* - * The transaction was allocated in the I/O submission thread, - * thus we need to mark ourselves as beeing in a transaction - * manually. + * The transaction may have been allocated in the I/O submission thread, + * thus we need to mark ourselves as beeing in a transaction manually. + * Similarly for freeze protection. */ current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); + rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], + 0, 1, _THIS_IP_); xfs_ilock(ip, XFS_ILOCK_EXCL); isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size); @@ -187,7 +189,8 @@ xfs_finish_ioend( if (ioend->io_type == XFS_IO_UNWRITTEN) queue_work(mp->m_unwritten_workqueue, &ioend->io_work); - else if (ioend->io_append_trans) + else if (ioend->io_append_trans || + (ioend->io_isdirect && xfs_ioend_is_append(ioend))) queue_work(mp->m_data_workqueue, &ioend->io_work); else xfs_destroy_ioend(ioend); @@ -205,15 +208,6 @@ xfs_end_io( struct xfs_inode *ip = XFS_I(ioend->io_inode); int error = 0; - if (ioend->io_append_trans) { - /* - * We've got freeze protection passed with the transaction. - * Tell lockdep about it. - */ - rwsem_acquire_read( - &ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], - 0, 1, _THIS_IP_); - } if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { ioend->io_error = -EIO; goto done; @@ -226,35 +220,31 @@ xfs_end_io( * range to normal written extens after the data I/O has finished. */ if (ioend->io_type == XFS_IO_UNWRITTEN) { + error = xfs_iomap_write_unwritten(ip, ioend->io_offset, + ioend->io_size); + } else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) { /* - * For buffered I/O we never preallocate a transaction when - * doing the unwritten extent conversion, but for direct I/O - * we do not know if we are converting an unwritten extent - * or not at the point where we preallocate the transaction. + * For direct I/O we do not know if we need to allocate blocks + * or not so we can't preallocate an append transaction as that + * results in nested reservations and log space deadlocks. Hence + * allocate the transaction here. While this is sub-optimal and + * can block IO completion for some time, we're stuck with doing + * it this way until we can pass the ioend to the direct IO + * allocation callbacks and avoid nesting that way. 
*/ - if (ioend->io_append_trans) { - ASSERT(ioend->io_isdirect); - - current_set_flags_nested( - &ioend->io_append_trans->t_pflags, PF_FSTRANS); - xfs_trans_cancel(ioend->io_append_trans, 0); - } - - error = xfs_iomap_write_unwritten(ip, ioend->io_offset, - ioend->io_size); - if (error) { - ioend->io_error = -error; + error = xfs_setfilesize_trans_alloc(ioend); + if (error) goto done; - } + error = xfs_setfilesize(ioend); } else if (ioend->io_append_trans) { error = xfs_setfilesize(ioend); - if (error) - ioend->io_error = -error; } else { ASSERT(!xfs_ioend_is_append(ioend)); } done: + if (error) + ioend->io_error = -error; xfs_destroy_ioend(ioend); } @@ -1432,25 +1422,21 @@ xfs_vm_direct_IO( size_t size = iov_length(iov, nr_segs); /* - * We need to preallocate a transaction for a size update - * here. In the case that this write both updates the size - * and converts at least on unwritten extent we will cancel - * the still clean transaction after the I/O has finished. + * We cannot preallocate a size update transaction here as we + * don't know whether allocation is necessary or not. Hence we + * can only tell IO completion that one is necessary if we are + * not doing unwritten extent conversion. */ iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT); - if (offset + size > XFS_I(inode)->i_d.di_size) { - ret = xfs_setfilesize_trans_alloc(ioend); - if (ret) - goto out_destroy_ioend; + if (offset + size > XFS_I(inode)->i_d.di_size) ioend->io_isdirect = 1; - } ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, nr_segs, xfs_get_blocks_direct, xfs_end_io_direct_write, NULL, 0); if (ret != -EIOCBQUEUED && iocb->private) - goto out_trans_cancel; + goto out_destroy_ioend; } else { ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, nr_segs, @@ -1460,15 +1446,6 @@ xfs_vm_direct_IO( return ret; -out_trans_cancel: - if (ioend->io_append_trans) { - current_set_flags_nested(&ioend->io_append_trans->t_pflags, - PF_FSTRANS); - rwsem_acquire_read( - &inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], - 0, 1, _THIS_IP_); - xfs_trans_cancel(ioend->io_append_trans, 0); - } out_destroy_ioend: xfs_destroy_ioend(ioend); return ret; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index c6d6e136ba77..c49e2c12dba4 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -460,7 +460,8 @@ xfs_log_reserve( tic->t_trans_type = t_type; *ticp = tic; - xlog_grant_push_ail(log, tic->t_unit_res * tic->t_cnt); + xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt + : tic->t_unit_res); trace_xfs_log_reserve(log, tic); -- cgit v1.2.1 From b870553cdecb26d5291af09602352b763e323df2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 28 Nov 2012 13:01:02 +1100 Subject: xfs: fix stray dquot unlock when reclaiming dquots When we fail to get a dquot lock during reclaim, we jump to an error handler that unlocks the dquot. This is wrong as we didn't lock the dquot, and unlocking it means whoever is holding the lock has had it silently taken away, and hence it results in a lock imbalance. Found by inspection while modifying the code for the numa-lru patchset. This fixes a random hang I've been seeing on xfstest 232 for the past several months.
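The fix below reorders the cleanup labels so that only paths that actually hold the lock pass through the unlock; the old single out_busy label unlocked unconditionally, which is exactly the imbalance described above. The same two-label pattern in generic pthread C (a sketch, not the XFS dquot code; link with -lpthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void reclaim_one(int dirty)
{
	if (pthread_mutex_trylock(&lock) != 0)
		goto out_move_tail;		/* never locked: must not unlock */

	if (dirty)
		goto out_unlock_move_tail;	/* locked: unlock before bailing */

	printf("reclaimed\n");
	pthread_mutex_unlock(&lock);
	return;

out_unlock_move_tail:
	pthread_mutex_unlock(&lock);
out_move_tail:
	printf("busy, moved to tail of list\n");
}

int main(void)
{
	reclaim_one(1);
	reclaim_one(0);
	return 0;
}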
cc: Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_qm.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index e6a0af0ba007..60eff4763156 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1456,7 +1456,7 @@ xfs_qm_dqreclaim_one( int error; if (!xfs_dqlock_nowait(dqp)) - goto out_busy; + goto out_move_tail; /* * This dquot has acquired a reference in the meantime remove it from @@ -1479,7 +1479,7 @@ xfs_qm_dqreclaim_one( * getting flushed to disk, we don't want to reclaim it. */ if (!xfs_dqflock_nowait(dqp)) - goto out_busy; + goto out_unlock_move_tail; if (XFS_DQ_IS_DIRTY(dqp)) { struct xfs_buf *bp = NULL; @@ -1490,7 +1490,7 @@ xfs_qm_dqreclaim_one( if (error) { xfs_warn(mp, "%s: dquot %p flush failed", __func__, dqp); - goto out_busy; + goto out_unlock_move_tail; } xfs_buf_delwri_queue(bp, buffer_list); @@ -1499,7 +1499,7 @@ xfs_qm_dqreclaim_one( * Give the dquot another try on the freelist, as the * flushing will take some time. */ - goto out_busy; + goto out_unlock_move_tail; } xfs_dqfunlock(dqp); @@ -1518,14 +1518,13 @@ xfs_qm_dqreclaim_one( XFS_STATS_INC(xs_qm_dqreclaims); return; -out_busy: - xfs_dqunlock(dqp); - /* * Move the dquot to the tail of the list so that we don't spin on it. */ +out_unlock_move_tail: + xfs_dqunlock(dqp); +out_move_tail: list_move_tail(&dqp->q_lru, &qi->qi_lru_list); - trace_xfs_dqreclaim_busy(dqp); XFS_STATS_INC(xs_qm_dqreclaim_misses); } -- cgit v1.2.1 From ab73857e354ab9e317613cba7db714e2c12c6547 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 29 Nov 2012 12:27:00 -0800 Subject: direct-io: don't read inode->i_blkbits multiple times Since directio can work on a raw block device, and the block size of the device can change under it, we need to do the same thing that fs/buffer.c now does: read the block size a single time, using ACCESS_ONCE(). Reading it multiple times can get different results, which will then confuse the code because it actually encodes the i_blksize in relationship to the underlying logical blocksize. 
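The pattern the patch applies is to snapshot the racy field exactly once and derive every dependent value from the local copy. A self-contained sketch, with ACCESS_ONCE in its classic volatile-cast form and a toy inode struct standing in for the kernel's (builds with GCC or Clang):

#include <stdio.h>

#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

struct toy_inode {
	unsigned int i_blkbits;	/* may change under us on a raw blockdev */
};

int main(void)
{
	struct toy_inode inode = { .i_blkbits = 12 };

	/* One read; all derived values now agree with the same snapshot. */
	unsigned int i_blkbits = ACCESS_ONCE(inode.i_blkbits);
	unsigned int blocksize_mask = (1u << i_blkbits) - 1;
	unsigned long long blocks = 65536ULL >> i_blkbits;

	printf("mask=%u blocks=%llu\n", blocksize_mask, blocks);
	return 0;
}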
Signed-off-by: Linus Torvalds --- fs/direct-io.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index f86c720dba0e..cf5b44b10c67 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -540,6 +540,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, sector_t fs_endblk; /* Into file, in filesystem-sized blocks */ unsigned long fs_count; /* Number of filesystem-sized blocks */ int create; + unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor; /* * If there was a memory error and we've overwritten all the @@ -554,7 +555,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, fs_count = fs_endblk - fs_startblk + 1; map_bh->b_state = 0; - map_bh->b_size = fs_count << dio->inode->i_blkbits; + map_bh->b_size = fs_count << i_blkbits; /* * For writes inside i_size on a DIO_SKIP_HOLES filesystem we @@ -1053,7 +1054,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, int seg; size_t size; unsigned long addr; - unsigned blkbits = inode->i_blkbits; + unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits); + unsigned blkbits = i_blkbits; unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; loff_t end = offset; @@ -1149,7 +1151,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, dio->inode = inode; dio->rw = rw; sdio.blkbits = blkbits; - sdio.blkfactor = inode->i_blkbits - blkbits; + sdio.blkfactor = i_blkbits - blkbits; sdio.block_in_file = offset >> blkbits; sdio.get_block = get_block; -- cgit v1.2.1 From bbec0270bdd887f96377065ee38b8848b5afa395 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 29 Nov 2012 12:31:52 -0800 Subject: blkdev_max_block: make private to fs/buffer.c We really don't want to look at the block size for the raw block device accesses in fs/block-dev.c, because it may be changing from under us. So get rid of the max_block logic entirely, since the caller should already have done it anyway. That leaves the only user of this function in fs/buffer.c, so move the whole function there and make it static. Signed-off-by: Linus Torvalds --- fs/block_dev.c | 55 +------------------------------------------------------ fs/buffer.c | 14 +++++++++++++- 2 files changed, 14 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 47a949d8a07e..a1e09b4fe1ba 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -70,19 +70,6 @@ static void bdev_inode_switch_bdi(struct inode *inode, spin_unlock(&dst->wb.list_lock); } -sector_t blkdev_max_block(struct block_device *bdev) -{ - sector_t retval = ~((sector_t)0); - loff_t sz = i_size_read(bdev->bd_inode); - - if (sz) { - unsigned int size = block_size(bdev); - unsigned int sizebits = blksize_bits(size); - retval = (sz >> sizebits); - } - return retval; -} - /* Kill _all_ buffers and pagecache , dirty or not.. */ void kill_bdev(struct block_device *bdev) { @@ -163,52 +150,12 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { - if (iblock >= blkdev_max_block(I_BDEV(inode))) { - if (create) - return -EIO; - - /* - * for reads, we're just trying to fill a partial page. 
- * return a hole, they will have to call get_block again - * before they can fill it, and they will get -EIO at that - * time - */ - return 0; - } bh->b_bdev = I_BDEV(inode); bh->b_blocknr = iblock; set_buffer_mapped(bh); return 0; } -static int -blkdev_get_blocks(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) -{ - sector_t end_block = blkdev_max_block(I_BDEV(inode)); - unsigned long max_blocks = bh->b_size >> inode->i_blkbits; - - if ((iblock + max_blocks) > end_block) { - max_blocks = end_block - iblock; - if ((long)max_blocks <= 0) { - if (create) - return -EIO; /* write fully beyond EOF */ - /* - * It is a read which is fully beyond EOF. We return - * a !buffer_mapped buffer - */ - max_blocks = 0; - } - } - - bh->b_bdev = I_BDEV(inode); - bh->b_blocknr = iblock; - bh->b_size = max_blocks << inode->i_blkbits; - if (max_blocks) - set_buffer_mapped(bh); - return 0; -} - static ssize_t blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) @@ -217,7 +164,7 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, struct inode *inode = file->f_mapping->host; return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset, - nr_segs, blkdev_get_blocks, NULL, NULL, 0); + nr_segs, blkdev_get_block, NULL, NULL, 0); } int __sync_blockdev(struct block_device *bdev, int wait) diff --git a/fs/buffer.c b/fs/buffer.c index 28a74ff5324b..3586fb05c8ce 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -911,6 +911,18 @@ link_dev_buffers(struct page *page, struct buffer_head *head) attach_page_buffers(page, head); } +static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size) +{ + sector_t retval = ~((sector_t)0); + loff_t sz = i_size_read(bdev->bd_inode); + + if (sz) { + unsigned int sizebits = blksize_bits(size); + retval = (sz >> sizebits); + } + return retval; +} + /* * Initialise the state of a blockdev page's buffers. */ @@ -921,7 +933,7 @@ init_page_buffers(struct page *page, struct block_device *bdev, struct buffer_head *head = page_buffers(page); struct buffer_head *bh = head; int uptodate = PageUptodate(page); - sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode)); + sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size); do { if (!buffer_mapped(bh)) { -- cgit v1.2.1 From 69c499d152a7fe2c4443e5ddd91568ad5a79145a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 29 Nov 2012 21:13:48 -0500 Subject: ext4: restructure ext4_ext_direct_IO() Remove a level of indentation by moving the DIO read and extending write case to the beginning of the file. This results in no actual programmatic changes to the file, but makes it easier to read/understand. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 211 +++++++++++++++++++++++++++----------------------------- 1 file changed, 103 insertions(+), 108 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cf5d30a7cce3..91a24967b8ae 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2927,10 +2927,10 @@ retry: * fall back to buffered IO. * * For holes, we fallocate those blocks, mark them as uninitialized - * If those blocks were preallocated, we mark sure they are splited, but + * If those blocks were preallocated, we mark sure they are split, but * still keep the range to write as uninitialized. * - * The unwrritten extents will be converted to written when DIO is completed. + * The unwritten extents will be converted to written when DIO is completed. 
* For async direct IO, since the IO may still pending when return, we * set up an end_io call back function, which will do the conversion * when async direct IO completed. @@ -2948,125 +2948,120 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, struct inode *inode = file->f_mapping->host; ssize_t ret; size_t count = iov_length(iov, nr_segs); - + int overwrite = 0; + get_block_t *get_block_func = NULL; + int dio_flags = 0; loff_t final_size = offset + count; - if (rw == WRITE && final_size <= inode->i_size) { - int overwrite = 0; - get_block_t *get_block_func = NULL; - int dio_flags = 0; - BUG_ON(iocb->private == NULL); + /* Use the old path for reads and writes beyond i_size. */ + if (rw != WRITE || final_size > inode->i_size) + return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); - /* If we do a overwrite dio, i_mutex locking can be released */ - overwrite = *((int *)iocb->private); + BUG_ON(iocb->private == NULL); - if (overwrite) { - atomic_inc(&inode->i_dio_count); - down_read(&EXT4_I(inode)->i_data_sem); - mutex_unlock(&inode->i_mutex); - } + /* If we do a overwrite dio, i_mutex locking can be released */ + overwrite = *((int *)iocb->private); - /* - * We could direct write to holes and fallocate. - * - * Allocated blocks to fill the hole are marked as uninitialized - * to prevent parallel buffered read to expose the stale data - * before DIO complete the data IO. - * - * As to previously fallocated extents, ext4 get_block - * will just simply mark the buffer mapped but still - * keep the extents uninitialized. - * - * for non AIO case, we will convert those unwritten extents - * to written after return back from blockdev_direct_IO. - * - * for async DIO, the conversion needs to be defered when - * the IO is completed. The ext4 end_io callback function - * will be called to take care of the conversion work. - * Here for async case, we allocate an io_end structure to - * hook to the iocb. - */ - iocb->private = NULL; - ext4_inode_aio_set(inode, NULL); - if (!is_sync_kiocb(iocb)) { - ext4_io_end_t *io_end = - ext4_init_io_end(inode, GFP_NOFS); - if (!io_end) { - ret = -ENOMEM; - goto retake_lock; - } - io_end->flag |= EXT4_IO_END_DIRECT; - iocb->private = io_end; - /* - * we save the io structure for current async - * direct IO, so that later ext4_map_blocks() - * could flag the io structure whether there - * is a unwritten extents needs to be converted - * when IO is completed. - */ - ext4_inode_aio_set(inode, io_end); - } + if (overwrite) { + atomic_inc(&inode->i_dio_count); + down_read(&EXT4_I(inode)->i_data_sem); + mutex_unlock(&inode->i_mutex); + } - if (overwrite) { - get_block_func = ext4_get_block_write_nolock; - } else { - get_block_func = ext4_get_block_write; - dio_flags = DIO_LOCKING; + /* + * We could direct write to holes and fallocate. + * + * Allocated blocks to fill the hole are marked as + * uninitialized to prevent parallel buffered read to expose + * the stale data before DIO complete the data IO. + * + * As to previously fallocated extents, ext4 get_block will + * just simply mark the buffer mapped but still keep the + * extents uninitialized. + * + * For non AIO case, we will convert those unwritten extents + * to written after return back from blockdev_direct_IO. + * + * For async DIO, the conversion needs to be deferred when the + * IO is completed. The ext4 end_io callback function will be + * called to take care of the conversion work. Here for async + * case, we allocate an io_end structure to hook to the iocb. 
+ */ + iocb->private = NULL; + ext4_inode_aio_set(inode, NULL); + if (!is_sync_kiocb(iocb)) { + ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); + if (!io_end) { + ret = -ENOMEM; + goto retake_lock; } - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, - get_block_func, - ext4_end_io_dio, - NULL, - dio_flags); - - if (iocb->private) - ext4_inode_aio_set(inode, NULL); + io_end->flag |= EXT4_IO_END_DIRECT; + iocb->private = io_end; /* - * The io_end structure takes a reference to the inode, - * that structure needs to be destroyed and the - * reference to the inode need to be dropped, when IO is - * complete, even with 0 byte write, or failed. - * - * In the successful AIO DIO case, the io_end structure will be - * desctroyed and the reference to the inode will be dropped - * after the end_io call back function is called. - * - * In the case there is 0 byte write, or error case, since - * VFS direct IO won't invoke the end_io call back function, - * we need to free the end_io structure here. + * we save the io structure for current async direct + * IO, so that later ext4_map_blocks() could flag the + * io structure whether there is a unwritten extents + * needs to be converted when IO is completed. */ - if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { - ext4_free_io_end(iocb->private); - iocb->private = NULL; - } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, - EXT4_STATE_DIO_UNWRITTEN)) { - int err; - /* - * for non AIO case, since the IO is already - * completed, we could do the conversion right here - */ - err = ext4_convert_unwritten_extents(inode, - offset, ret); - if (err < 0) - ret = err; - ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); - } + ext4_inode_aio_set(inode, io_end); + } - retake_lock: - /* take i_mutex locking again if we do a ovewrite dio */ - if (overwrite) { - inode_dio_done(inode); - up_read(&EXT4_I(inode)->i_data_sem); - mutex_lock(&inode->i_mutex); - } + if (overwrite) { + get_block_func = ext4_get_block_write_nolock; + } else { + get_block_func = ext4_get_block_write; + dio_flags = DIO_LOCKING; + } + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + get_block_func, + ext4_end_io_dio, + NULL, + dio_flags); + + if (iocb->private) + ext4_inode_aio_set(inode, NULL); + /* + * The io_end structure takes a reference to the inode, that + * structure needs to be destroyed and the reference to the + * inode need to be dropped, when IO is complete, even with 0 + * byte write, or failed. + * + * In the successful AIO DIO case, the io_end structure will + * be destroyed and the reference to the inode will be dropped + * after the end_io call back function is called. + * + * In the case there is 0 byte write, or error case, since VFS + * direct IO won't invoke the end_io call back function, we + * need to free the end_io structure here. 
+ */ + if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { + ext4_free_io_end(iocb->private); + iocb->private = NULL; + } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, + EXT4_STATE_DIO_UNWRITTEN)) { + int err; + /* + * for non AIO case, since the IO is already + * completed, we could do the conversion right here + */ + err = ext4_convert_unwritten_extents(inode, + offset, ret); + if (err < 0) + ret = err; + ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); + } - return ret; +retake_lock: + /* take i_mutex locking again if we do a ovewrite dio */ + if (overwrite) { + inode_dio_done(inode); + up_read(&EXT4_I(inode)->i_data_sem); + mutex_lock(&inode->i_mutex); } - /* for write the the end of file case, we fall back to old way */ - return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); + return ret; } static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, -- cgit v1.2.1 From aeb1e5d69a5be592e86a926be73efb38c55af404 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 29 Nov 2012 21:21:22 -0500 Subject: ext4: fix possible use after free with metadata csum Commit fa77dcfafeaa introduces block bitmap checksum calculation into ext4_new_inode() in the case that the block group was uninitialized. However, we brelse() the bitmap buffer before we attempt to checksum it, so we have no guarantee that the buffer is still there. Fix this by releasing the buffer after the possible checksum computation. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" Acked-by: Darrick J. Wong Cc: stable@vger.kernel.org --- fs/ext4/ialloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 3a100e7a62a8..c7efa88d7149 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -762,7 +762,6 @@ got: BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh); - brelse(block_bitmap_bh); /* recheck and clear flag under lock if we still need to */ ext4_lock_group(sb, group); @@ -775,6 +774,7 @@ got: ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); + brelse(block_bitmap_bh); if (err) goto fail; -- cgit v1.2.1 From 696199f8ccf7fc6d17ef89c296ad3b6c78c52d9c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 29 Nov 2012 22:00:51 -0500 Subject: don't do blind d_drop() in nfs_prime_dcache() Signed-off-by: Al Viro --- fs/nfs/dir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index ce8cb926526b..99489cfca24d 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -450,7 +450,8 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) nfs_refresh_inode(dentry->d_inode, entry->fattr); goto out; } else { - d_drop(dentry); + if (d_invalidate(dentry) != 0) + goto out; dput(dentry); } } -- cgit v1.2.1 From c44600c9d1de64314c2bd58103f15acb53e10073 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 29 Nov 2012 22:04:36 -0500 Subject: nfs_lookup_revalidate(): fix a leak We are leaking fattr and fhandle if we decide that the dentry is not to be invalidated after all (e.g. it happens to be a mountpoint). Just free both before that...
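The shape of the fix is to hoist the frees to the top of the shared error path, ahead of the early return that keeps the dentry, so every route through it releases the buffers. A toy version in plain C, where malloc/free stand in for nfs_free_fattr()/nfs_free_fhandle() and the control flow is invented for illustration:

#include <stdlib.h>

static int revalidate(int lookup_ok, int is_mountpoint)
{
	void *fattr = malloc(64);
	void *fhandle = malloc(64);

	if (!fattr || !fhandle || !lookup_ok)
		goto out_bad;

	free(fattr);
	free(fhandle);
	return 1;			/* dentry is valid */

out_bad:
	free(fattr);			/* hoisted: runs before any early return */
	free(fhandle);			/* free(NULL) is a harmless no-op */
	if (is_mountpoint)
		return 1;		/* keep the dentry; this path used to leak */
	return 0;			/* drop the dentry */
}

int main(void)
{
	return !revalidate(1, 0);
}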
Signed-off-by: Al Viro --- fs/nfs/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 99489cfca24d..b9e66b7e0c14 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1101,6 +1101,8 @@ out_set_verifier: out_zap_parent: nfs_zap_caches(dir); out_bad: + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); nfs_mark_for_revalidate(dir); if (inode && S_ISDIR(inode->i_mode)) { /* Purge readdir caches. */ @@ -1113,8 +1115,6 @@ out_zap_parent: shrink_dcache_parent(dentry); } d_drop(dentry); - nfs_free_fattr(fattr); - nfs_free_fhandle(fhandle); dput(parent); dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", __func__, dentry->d_parent->d_name.name, -- cgit v1.2.1 From 0903a0c8491c1e987dfc6eb294199a36760398bc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 29 Nov 2012 22:11:06 -0500 Subject: cifs: get rid of blind d_drop() in readdir Signed-off-by: Al Viro --- fs/cifs/readdir.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index f9b5d3d6cf33..1c576e871366 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -86,14 +86,17 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name, dentry = d_lookup(parent, name); if (dentry) { + int err; inode = dentry->d_inode; /* update inode in place if i_ino didn't change */ if (inode && CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) { cifs_fattr_to_inode(inode, fattr); return dentry; } - d_drop(dentry); + err = d_invalidate(dentry); dput(dentry); + if (err) + return NULL; } dentry = d_alloc(parent, name); -- cgit v1.2.1 From 21d8a15ac333b05f1fecdf9fdc30996be2e11d60 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 29 Nov 2012 22:17:21 -0500 Subject: lookup_one_len: don't accept . and .. Signed-off-by: Al Viro --- fs/namei.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 937f9d50c84b..5f4cdf3ad913 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2131,6 +2131,11 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) if (!len) return ERR_PTR(-EACCES); + if (unlikely(name[0] == '.')) { + if (len < 2 || (len == 2 && name[1] == '.')) + return ERR_PTR(-EACCES); + } + while (len--) { c = *(const unsigned char *)name++; if (c == '/' || c == '\0') -- cgit v1.2.1 From a77cfcb429ed98845a4e4df72473b8f37acd890b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 29 Nov 2012 22:57:33 -0500 Subject: fix off-by-one in argument passed by iterate_fd() to callbacks Noticed by Pavel Roskin; the thing in his patch I disagree with was compensating for that shite in callbacks instead of fixing it once in the iterator itself. 
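The bug is easiest to see in isolation: fetching the entry with fd[n++] and then calling f(p, file, n) hands the callback the index of the *next* slot, not the slot the file actually came from. A minimal sketch of the corrected iterator shape (hypothetical flat table instead of the kernel's struct fdtable and RCU machinery):

typedef int (*iter_cb)(const void *p, int entry, unsigned int slot);

static int iterate_sketch(const int *table, unsigned int max,
			  iter_cb f, const void *p)
{
	unsigned int n;
	int res = 0;

	for (n = 0; n < max; n++) {
		if (!table[n])
			continue;
		res = f(p, table[n], n);	/* the buggy form passed n + 1 here */
		if (res)
			break;
	}
	return res;
}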
Signed-off-by: Al Viro --- fs/file.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index 7cb71b992603..eff23162485f 100644 --- a/fs/file.c +++ b/fs/file.c @@ -994,16 +994,18 @@ int iterate_fd(struct files_struct *files, unsigned n, const void *p) { struct fdtable *fdt; - struct file *file; int res = 0; if (!files) return 0; spin_lock(&files->file_lock); - fdt = files_fdtable(files); - while (!res && n < fdt->max_fds) { - file = rcu_dereference_check_fdtable(files, fdt->fd[n++]); - if (file) - res = f(p, file, n); + for (fdt = files_fdtable(files); n < fdt->max_fds; n++) { + struct file *file; + file = rcu_dereference_check_fdtable(files, fdt->fd[n]); + if (!file) + continue; + res = f(p, file, n); + if (res) + break; } spin_unlock(&files->file_lock); return res; -- cgit v1.2.1 From 152a7b0a808a00601328feba2001cbb2b530f771 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sun, 2 Dec 2012 11:13:24 -0500 Subject: ext4: move extra inode read to a new function Currently, in ext4_iget we do a simple check to see whether there does exist some information starting from the end of i_extra_size. With inline data added, this procedure is more complicated. So move it to a new function named ext4_iget_extra_inode. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 91a24967b8ae..befa005711a1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3700,6 +3700,16 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, } } +static inline void ext4_iget_extra_inode(struct inode *inode, + struct ext4_inode *raw_inode, + struct ext4_inode_info *ei) +{ + __le32 *magic = (void *)raw_inode + + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; + if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) + ext4_set_inode_state(inode, EXT4_STATE_XATTR); +} + struct inode *ext4_iget(struct super_block *sb, unsigned long ino) { struct ext4_iloc iloc; @@ -3842,11 +3852,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; } else { - __le32 *magic = (void *)raw_inode + - EXT4_GOOD_OLD_INODE_SIZE + - ei->i_extra_isize; - if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) - ext4_set_inode_state(inode, EXT4_STATE_XATTR); + ext4_iget_extra_inode(inode, raw_inode, ei); } } -- cgit v1.2.1 From e5f9570319771bb0a5afc792b34fbd5564b935c8 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 30 Nov 2012 17:24:18 -0500 Subject: nfsd4: discard some unused nfsd4_verify xdr code Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4xdr.c | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index b775366a0a68..3bf8a9d7f217 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1106,31 +1106,14 @@ nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_s static __be32 nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify) { -#if 0 - struct nfsd4_compoundargs save = { - .p = argp->p, - .end = argp->end, - .rqstp = argp->rqstp, - }; - u32 ve_bmval[2]; - struct iattr ve_iattr; /* request */ - struct nfs4_acl *ve_acl; /* request */ -#endif DECODE_HEAD; if ((status = nfsd4_decode_bitmap(argp, verify->ve_bmval))) goto out; /* For convenience's sake, we compare raw xdr'd attributes in - * nfsd4_proc_verify; however we still decode here just to return - * correct error in case of bad xdr. */ -#if 0 - status = nfsd4_decode_fattr(ve_bmval, &ve_iattr, &ve_acl); - if (status == nfserr_inval) { - status = nfserrno(status); - goto out; - } -#endif + * nfsd4_proc_verify */ + READ_BUF(4); READ32(verify->ve_attrlen); READ_BUF(verify->ve_attrlen); -- cgit v1.2.1 From 043958395a6b91863046b0cd7cae9c67fa845144 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Thu, 29 Nov 2012 11:40:38 -0500 Subject: NFSD: Lock state before calling fault injection function Each function touches state in some way, so getting the lock earlier can help simplify code. Signed-off-by: Bryan Schumaker Signed-off-by: J. Bruce Fields --- fs/nfsd/fault_inject.c | 2 ++ fs/nfsd/nfs4state.c | 18 ++---------------- 2 files changed, 4 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c index 02781121c6b0..4b385a14cf96 100644 --- a/fs/nfsd/fault_inject.c +++ b/fs/nfsd/fault_inject.c @@ -51,7 +51,9 @@ static int nfsd_inject_set(void *op_ptr, u64 val) else printk(KERN_INFO "NFSD Fault Injection: %s (n = %llu)", op->file, val); + nfs4_lock_state(); op->func(val); + nfs4_unlock_state(); return 0; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b1aa577dd869..8e19c692649c 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4611,13 +4611,11 @@ void nfsd_forget_clients(u64 num) int count = 0; struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, nfsd_net_id); - nfs4_lock_state(); list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) { expire_client(clp); if (++count == num) break; } - nfs4_unlock_state(); printk(KERN_INFO "NFSD: Forgot %d clients", count); } @@ -4653,25 +4651,15 @@ static int nfsd_release_n_owners(u64 num, bool is_open_owner, void nfsd_forget_locks(u64 num) { - int count; struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); - - nfs4_lock_state(); - count = nfsd_release_n_owners(num, false, release_lockowner_sop, nn); - nfs4_unlock_state(); - + int count = nfsd_release_n_owners(num, false, release_lockowner_sop, nn); printk(KERN_INFO "NFSD: Forgot %d locks", count); } void nfsd_forget_openowners(u64 num) { - int count; struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); - - nfs4_lock_state(); - count = nfsd_release_n_owners(num, true, release_openowner_sop, nn); - nfs4_unlock_state(); - + int count = nfsd_release_n_owners(num, true, release_openowner_sop, nn); printk(KERN_INFO "NFSD: Forgot %d open owners", count); } @@ -4704,10 +4692,8 @@ void nfsd_forget_delegations(u64 num) count = nfsd_process_n_delegations(num, &victims); spin_unlock(&recall_lock); - nfs4_lock_state(); 
list_for_each_entry_safe(dp, dnext, &victims, dl_recall_lru) unhash_delegation(dp); - nfs4_unlock_state(); printk(KERN_INFO "NFSD: Forgot %d delegations", count); } -- cgit v1.2.1 From 44e34da60b24ca14666534b61cc9579aa4e1eac5 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Thu, 29 Nov 2012 11:40:39 -0500 Subject: NFSD: Clean up forgetting clients I added in a generic for-each loop that takes a pass over the client_lru list for the current net namespace and calls some function. The next few patches will update other operations to use this function as well. A value of 0 still means "forget everything that is found". Signed-off-by: Bryan Schumaker Signed-off-by: J. Bruce Fields --- fs/nfsd/netns.h | 3 +++ fs/nfsd/nfs4state.c | 25 ++++++++++++++++++++----- 2 files changed, 23 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 2c4b2e2896dd..964b5542f027 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -92,5 +92,8 @@ struct nfsd_net { time_t nfsd4_grace; }; +/* Simple check to find out if a given net was properly initialized */ +#define nfsd_netns_ready(nn) ((nn)->sessionid_hashtbl) + extern int nfsd_net_id; #endif /* __NFSD_NETNS_H__ */ diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8e19c692649c..2478c8996bda 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4605,19 +4605,34 @@ nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn) #ifdef CONFIG_NFSD_FAULT_INJECTION -void nfsd_forget_clients(u64 num) +u64 nfsd_forget_client(struct nfs4_client *clp, u64 max) +{ + expire_client(clp); + return 1; +} + +u64 nfsd_for_n_state(u64 max, u64 (*func)(struct nfs4_client *, u64)) { struct nfs4_client *clp, *next; - int count = 0; + u64 count = 0; struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, nfsd_net_id); + if (!nfsd_netns_ready(nn)) + return 0; + list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) { - expire_client(clp); - if (++count == num) + count += func(clp, max - count); + if ((max != 0) && (count >= max)) break; } - printk(KERN_INFO "NFSD: Forgot %d clients", count); + return count; +} + +void nfsd_forget_clients(u64 num) +{ + u64 count = nfsd_for_n_state(num, nfsd_forget_client); + printk(KERN_INFO "NFSD: Forgot %llu clients", count); } static void release_lockowner_sop(struct nfs4_stateowner *sop) -- cgit v1.2.1 From fc29171f5b3257694bf508cf4ae51970c97af78c Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Thu, 29 Nov 2012 11:40:40 -0500 Subject: NFSD: Clean up forgetting locks I use the new "forget_n_state()" function to iterate through each client first when searching for locks. This may slow down forgetting locks a little bit, but it implements most of the code needed to forget a specified client's locks. Signed-off-by: Bryan Schumaker Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 2478c8996bda..46bece40b1ce 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4611,6 +4611,32 @@ u64 nfsd_forget_client(struct nfs4_client *clp, u64 max) return 1; } +static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max, void (*func)(struct nfs4_lockowner *)) +{ + struct nfs4_openowner *oop; + struct nfs4_lockowner *lop, *lo_next; + struct nfs4_ol_stateid *stp, *st_next; + u64 count = 0; + + list_for_each_entry(oop, &clp->cl_openowners, oo_perclient) { + list_for_each_entry_safe(stp, st_next, &oop->oo_owner.so_stateids, st_perstateowner) { + list_for_each_entry_safe(lop, lo_next, &stp->st_lockowners, lo_perstateid) { + if (func) + func(lop); + if (++count == max) + return count; + } + } + } + + return count; +} + +u64 nfsd_forget_client_locks(struct nfs4_client *clp, u64 max) +{ + return nfsd_foreach_client_lock(clp, max, release_lockowner); +} + u64 nfsd_for_n_state(u64 max, u64 (*func)(struct nfs4_client *, u64)) { struct nfs4_client *clp, *next; @@ -4635,11 +4661,6 @@ void nfsd_forget_clients(u64 num) printk(KERN_INFO "NFSD: Forgot %llu clients", count); } -static void release_lockowner_sop(struct nfs4_stateowner *sop) -{ - release_lockowner(lockowner(sop)); -} - static void release_openowner_sop(struct nfs4_stateowner *sop) { release_openowner(openowner(sop)); @@ -4666,9 +4687,8 @@ static int nfsd_release_n_owners(u64 num, bool is_open_owner, void nfsd_forget_locks(u64 num) { - struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); - int count = nfsd_release_n_owners(num, false, release_lockowner_sop, nn); - printk(KERN_INFO "NFSD: Forgot %d locks", count); + u64 count = nfsd_for_n_state(num, nfsd_forget_client_locks); + printk(KERN_INFO "NFSD: Forgot %llu locks", count); } void nfsd_forget_openowners(u64 num) -- cgit v1.2.1 From 4dbdbda84f963312e0b5dfdf2dfbf64de047dd44 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Thu, 29 Nov 2012 11:40:41 -0500 Subject: NFSD: Clean up forgetting openowners Using "forget_n_state()" forces me to implement the code needed to forget a specific client's openowners. Signed-off-by: Bryan Schumaker Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 49 ++++++++++++++++++++++--------------------------- 1 file changed, 22 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 46bece40b1ce..00d4398e2324 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4637,6 +4637,26 @@ u64 nfsd_forget_client_locks(struct nfs4_client *clp, u64 max) return nfsd_foreach_client_lock(clp, max, release_lockowner); } +static u64 nfsd_foreach_client_open(struct nfs4_client *clp, u64 max, void (*func)(struct nfs4_openowner *)) +{ + struct nfs4_openowner *oop, *next; + u64 count = 0; + + list_for_each_entry_safe(oop, next, &clp->cl_openowners, oo_perclient) { + if (func) + func(oop); + if (++count == max) + break; + } + + return count; +} + +u64 nfsd_forget_client_openowners(struct nfs4_client *clp, u64 max) +{ + return nfsd_foreach_client_open(clp, max, release_openowner); +} + u64 nfsd_for_n_state(u64 max, u64 (*func)(struct nfs4_client *, u64)) { struct nfs4_client *clp, *next; @@ -4661,30 +4681,6 @@ void nfsd_forget_clients(u64 num) printk(KERN_INFO "NFSD: Forgot %llu clients", count); } -static void release_openowner_sop(struct nfs4_stateowner *sop) -{ - release_openowner(openowner(sop)); -} - -static int nfsd_release_n_owners(u64 num, bool is_open_owner, - void (*release_sop)(struct nfs4_stateowner *), - struct nfsd_net *nn) -{ - int i, count = 0; - struct nfs4_stateowner *sop, *next; - - for (i = 0; i < OWNER_HASH_SIZE; i++) { - list_for_each_entry_safe(sop, next, &nn->ownerstr_hashtbl[i], so_strhash) { - if (sop->so_is_open_owner != is_open_owner) - continue; - release_sop(sop); - if (++count == num) - return count; - } - } - return count; -} - void nfsd_forget_locks(u64 num) { u64 count = nfsd_for_n_state(num, nfsd_forget_client_locks); @@ -4693,9 +4689,8 @@ void nfsd_forget_locks(u64 num) void nfsd_forget_openowners(u64 num) { - struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); - int count = nfsd_release_n_owners(num, true, release_openowner_sop, nn); - printk(KERN_INFO "NFSD: Forgot %d open owners", count); + u64 count = nfsd_for_n_state(num, nfsd_forget_client_openowners); + printk(KERN_INFO "NFSD: Forgot %llu open owners", count); } static int nfsd_process_n_delegations(u64 num, struct list_head *list) -- cgit v1.2.1 From 269de30f10604710dde8d544748b5b6c748b7de8 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Thu, 29 Nov 2012 11:40:42 -0500 Subject: NFSD: Clean up forgetting and recalling delegations Once I have a client, I can easily use its delegation list rather than searching the file hash table for delegations to remove. Signed-off-by: Bryan Schumaker Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 94 ++++++++++++++++++++++++++++------------------------- 1 file changed, 50 insertions(+), 44 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 00d4398e2324..dc7c22f14fef 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4657,6 +4657,52 @@ u64 nfsd_forget_client_openowners(struct nfs4_client *clp, u64 max) return nfsd_foreach_client_open(clp, max, release_openowner); } +static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max, + struct list_head *victims) +{ + struct nfs4_delegation *dp, *next; + u64 count = 0; + + list_for_each_entry_safe(dp, next, &clp->cl_delegations, dl_perclnt) { + if (victims) + list_move(&dp->dl_recall_lru, victims); + if (++count == max) + break; + } + return count; +} + +u64 nfsd_forget_client_delegations(struct nfs4_client *clp, u64 max) +{ + struct nfs4_delegation *dp, *next; + LIST_HEAD(victims); + u64 count; + + spin_lock(&recall_lock); + count = nfsd_find_all_delegations(clp, max, &victims); + spin_unlock(&recall_lock); + + list_for_each_entry_safe(dp, next, &victims, dl_recall_lru) + unhash_delegation(dp); + + return count; +} + +u64 nfsd_recall_client_delegations(struct nfs4_client *clp, u64 max) +{ + struct nfs4_delegation *dp, *next; + LIST_HEAD(victims); + u64 count; + + spin_lock(&recall_lock); + count = nfsd_find_all_delegations(clp, max, &victims); + list_for_each_entry_safe(dp, next, &victims, dl_recall_lru) + nfsd_break_one_deleg(dp); + spin_unlock(&recall_lock); + + return count; +} + u64 nfsd_for_n_state(u64 max, u64 (*func)(struct nfs4_client *, u64)) { struct nfs4_client *clp, *next; @@ -4693,56 +4739,16 @@ void nfsd_forget_openowners(u64 num) printk(KERN_INFO "NFSD: Forgot %llu open owners", count); } -static int nfsd_process_n_delegations(u64 num, struct list_head *list) -{ - int i, count = 0; - struct nfs4_file *fp, *fnext; - struct nfs4_delegation *dp, *dnext; - - for (i = 0; i < FILE_HASH_SIZE; i++) { - list_for_each_entry_safe(fp, fnext, &file_hashtbl[i], fi_hash) { - list_for_each_entry_safe(dp, dnext, &fp->fi_delegations, dl_perfile) { - list_move(&dp->dl_recall_lru, list); - if (++count == num) - return count; - } - } - } - - return count; -} - void nfsd_forget_delegations(u64 num) { - unsigned int count; - LIST_HEAD(victims); - struct nfs4_delegation *dp, *dnext; - - spin_lock(&recall_lock); - count = nfsd_process_n_delegations(num, &victims); - spin_unlock(&recall_lock); - - list_for_each_entry_safe(dp, dnext, &victims, dl_recall_lru) - unhash_delegation(dp); - - printk(KERN_INFO "NFSD: Forgot %d delegations", count); + u64 count = nfsd_for_n_state(num, nfsd_forget_client_delegations); + printk(KERN_INFO "NFSD: Forgot %llu delegations", count); } void nfsd_recall_delegations(u64 num) { - unsigned int count; - LIST_HEAD(victims); - struct nfs4_delegation *dp, *dnext; - - spin_lock(&recall_lock); - count = nfsd_process_n_delegations(num, &victims); - list_for_each_entry_safe(dp, dnext, &victims, dl_recall_lru) { - list_del(&dp->dl_recall_lru); - nfsd_break_one_deleg(dp); - } - spin_unlock(&recall_lock); - - printk(KERN_INFO "NFSD: Recalled %d delegations", count); + u64 count = nfsd_for_n_state(num, nfsd_recall_client_delegations); + printk(KERN_INFO "NFSD: Recalled %llu delegations", count); } #endif /* CONFIG_NFSD_FAULT_INJECTION */ -- cgit v1.2.1 From 8ce54e0d82730ece61737c9fd7b61b28ab8c3390 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Thu, 29 Nov 2012 11:40:43 -0500 Subject: NFSD: Fault injection operations take a 
per-client forget function The eventual goal is to forget state based on ip address, so it makes sense to call this function in a for-each-client loop until the correct amount of state is forgotten. I also use this patch as an opportunity to rename the forget function from "func()" to "forget()". Signed-off-by: Bryan Schumaker Signed-off-by: J. Bruce Fields --- fs/nfsd/fault_inject.c | 16 +++++++++------- fs/nfsd/nfs4state.c | 30 ------------------------------ fs/nfsd/state.h | 12 +++++++----- 3 files changed, 16 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c index 4b385a14cf96..bf6161adf663 100644 --- a/fs/nfsd/fault_inject.c +++ b/fs/nfsd/fault_inject.c @@ -13,29 +13,29 @@ struct nfsd_fault_inject_op { char *file; - void (*func)(u64); + u64 (*forget)(struct nfs4_client *, u64); }; static struct nfsd_fault_inject_op inject_ops[] = { { .file = "forget_clients", - .func = nfsd_forget_clients, + .forget = nfsd_forget_client, }, { .file = "forget_locks", - .func = nfsd_forget_locks, + .forget = nfsd_forget_client_locks, }, { .file = "forget_openowners", - .func = nfsd_forget_openowners, + .forget = nfsd_forget_client_openowners, }, { .file = "forget_delegations", - .func = nfsd_forget_delegations, + .forget = nfsd_forget_client_delegations, }, { .file = "recall_delegations", - .func = nfsd_recall_delegations, + .forget = nfsd_recall_client_delegations, }, }; @@ -44,6 +44,7 @@ static struct dentry *debug_dir; static int nfsd_inject_set(void *op_ptr, u64 val) { + u64 count = 0; struct nfsd_fault_inject_op *op = op_ptr; if (val == 0) @@ -52,8 +53,9 @@ static int nfsd_inject_set(void *op_ptr, u64 val) printk(KERN_INFO "NFSD Fault Injection: %s (n = %llu)", op->file, val); nfs4_lock_state(); - op->func(val); + count = nfsd_for_n_state(val, op->forget); nfs4_unlock_state(); + printk(KERN_INFO "NFSD: %s: found %llu", op->file, count); return 0; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index dc7c22f14fef..ab45cdd7b3da 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4721,36 +4721,6 @@ u64 nfsd_for_n_state(u64 max, u64 (*func)(struct nfs4_client *, u64)) return count; } -void nfsd_forget_clients(u64 num) -{ - u64 count = nfsd_for_n_state(num, nfsd_forget_client); - printk(KERN_INFO "NFSD: Forgot %llu clients", count); -} - -void nfsd_forget_locks(u64 num) -{ - u64 count = nfsd_for_n_state(num, nfsd_forget_client_locks); - printk(KERN_INFO "NFSD: Forgot %llu locks", count); -} - -void nfsd_forget_openowners(u64 num) -{ - u64 count = nfsd_for_n_state(num, nfsd_forget_client_openowners); - printk(KERN_INFO "NFSD: Forgot %llu open owners", count); -} - -void nfsd_forget_delegations(u64 num) -{ - u64 count = nfsd_for_n_state(num, nfsd_forget_client_delegations); - printk(KERN_INFO "NFSD: Forgot %llu delegations", count); -} - -void nfsd_recall_delegations(u64 num) -{ - u64 count = nfsd_for_n_state(num, nfsd_recall_client_delegations); - printk(KERN_INFO "NFSD: Recalled %llu delegations", count); -} - #endif /* CONFIG_NFSD_FAULT_INJECTION */ /* initialization to perform at module load time: */ diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index b542bf2c0fe7..423ac64ceb74 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -501,11 +501,13 @@ extern void nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time); #ifdef CONFIG_NFSD_FAULT_INJECTION int nfsd_fault_inject_init(void); void nfsd_fault_inject_cleanup(void); -void nfsd_forget_clients(u64); -void nfsd_forget_locks(u64); -void 
nfsd_forget_openowners(u64); -void nfsd_forget_delegations(u64); -void nfsd_recall_delegations(u64); +u64 nfsd_for_n_state(u64, u64 (*)(struct nfs4_client *, u64)); + +u64 nfsd_forget_client(struct nfs4_client *, u64); +u64 nfsd_forget_client_locks(struct nfs4_client*, u64); +u64 nfsd_forget_client_openowners(struct nfs4_client *, u64); +u64 nfsd_forget_client_delegations(struct nfs4_client *, u64); +u64 nfsd_recall_client_delegations(struct nfs4_client *, u64); #else /* CONFIG_NFSD_FAULT_INJECTION */ static inline int nfsd_fault_inject_init(void) { return 0; } static inline void nfsd_fault_inject_cleanup(void) {} -- cgit v1.2.1 From 184c18471f7d0963ad5752692c4b441a546d88f1 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Thu, 29 Nov 2012 11:40:44 -0500 Subject: NFSD: Reading a fault injection file prints a state count I also log basic information that I can figure out about the type of state (such as number of locks for each client IP address). This can be useful for checking that state was actually dropped and later for checking if the client was able to recover. Signed-off-by: Bryan Schumaker Signed-off-by: J. Bruce Fields --- fs/nfsd/fault_inject.c | 13 +++++++++++-- fs/nfsd/nfs4state.c | 42 ++++++++++++++++++++++++++++++++++++++++++ fs/nfsd/state.h | 5 +++++ 3 files changed, 58 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c index bf6161adf663..545f8e4ed101 100644 --- a/fs/nfsd/fault_inject.c +++ b/fs/nfsd/fault_inject.c @@ -14,28 +14,34 @@ struct nfsd_fault_inject_op { char *file; u64 (*forget)(struct nfs4_client *, u64); + u64 (*print)(struct nfs4_client *, u64); }; static struct nfsd_fault_inject_op inject_ops[] = { { .file = "forget_clients", .forget = nfsd_forget_client, + .print = nfsd_print_client, }, { .file = "forget_locks", .forget = nfsd_forget_client_locks, + .print = nfsd_print_client_locks, }, { .file = "forget_openowners", .forget = nfsd_forget_client_openowners, + .print = nfsd_print_client_openowners, }, { .file = "forget_delegations", .forget = nfsd_forget_client_delegations, + .print = nfsd_print_client_delegations, }, { .file = "recall_delegations", .forget = nfsd_recall_client_delegations, + .print = nfsd_print_client_delegations, }, }; @@ -59,9 +65,12 @@ static int nfsd_inject_set(void *op_ptr, u64 val) return 0; } -static int nfsd_inject_get(void *data, u64 *val) +static int nfsd_inject_get(void *op_ptr, u64 *val) { - *val = 0; + struct nfsd_fault_inject_op *op = op_ptr; + nfs4_lock_state(); + *val = nfsd_for_n_state(0, op->print); + nfs4_unlock_state(); return 0; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ab45cdd7b3da..9fb8e52580f3 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4611,6 +4611,22 @@ u64 nfsd_forget_client(struct nfs4_client *clp, u64 max) return 1; } +u64 nfsd_print_client(struct nfs4_client *clp, u64 num) +{ + char buf[INET6_ADDRSTRLEN]; + rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, 129); + printk(KERN_INFO "NFS Client: %s\n", buf); + return 1; +} + +static void nfsd_print_count(struct nfs4_client *clp, unsigned int count, + const char *type) +{ + char buf[INET6_ADDRSTRLEN]; + rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, 129); + printk(KERN_INFO "NFS Client: %s has %u %s\n", buf, count, type); +} + static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max, void (*func)(struct nfs4_lockowner *)) { struct nfs4_openowner *oop; @@ -4637,6 +4653,13 @@ u64 nfsd_forget_client_locks(struct nfs4_client *clp, u64 max) return 
nfsd_foreach_client_lock(clp, max, release_lockowner); } +u64 nfsd_print_client_locks(struct nfs4_client *clp, u64 max) +{ + u64 count = nfsd_foreach_client_lock(clp, max, NULL); + nfsd_print_count(clp, count, "locked files"); + return count; +} + static u64 nfsd_foreach_client_open(struct nfs4_client *clp, u64 max, void (*func)(struct nfs4_openowner *)) { struct nfs4_openowner *oop, *next; @@ -4657,6 +4680,13 @@ u64 nfsd_forget_client_openowners(struct nfs4_client *clp, u64 max) return nfsd_foreach_client_open(clp, max, release_openowner); } +u64 nfsd_print_client_openowners(struct nfs4_client *clp, u64 max) +{ + u64 count = nfsd_foreach_client_open(clp, max, NULL); + nfsd_print_count(clp, count, "open files"); + return count; +} + static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max, struct list_head *victims) { @@ -4703,6 +4733,18 @@ u64 nfsd_recall_client_delegations(struct nfs4_client *clp, u64 max) return count; } +u64 nfsd_print_client_delegations(struct nfs4_client *clp, u64 max) +{ + u64 count = 0; + + spin_lock(&recall_lock); + count = nfsd_find_all_delegations(clp, max, NULL); + spin_unlock(&recall_lock); + + nfsd_print_count(clp, count, "delegations"); + return count; +} + u64 nfsd_for_n_state(u64 max, u64 (*func)(struct nfs4_client *, u64)) { struct nfs4_client *clp, *next; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 423ac64ceb74..4017f3553a63 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -508,6 +508,11 @@ u64 nfsd_forget_client_locks(struct nfs4_client*, u64); u64 nfsd_forget_client_openowners(struct nfs4_client *, u64); u64 nfsd_forget_client_delegations(struct nfs4_client *, u64); u64 nfsd_recall_client_delegations(struct nfs4_client *, u64); + +u64 nfsd_print_client(struct nfs4_client *, u64); +u64 nfsd_print_client_locks(struct nfs4_client *, u64); +u64 nfsd_print_client_openowners(struct nfs4_client *, u64); +u64 nfsd_print_client_delegations(struct nfs4_client *, u64); #else /* CONFIG_NFSD_FAULT_INJECTION */ static inline int nfsd_fault_inject_init(void) { return 0; } static inline void nfsd_fault_inject_cleanup(void) {} -- cgit v1.2.1 From d7cc431edd0a6c69a88b5ff1e304af50bfb2270e Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Thu, 29 Nov 2012 11:40:45 -0500 Subject: NFSD: Add a custom file operations structure for fault injection Controlling the read and write functions allows me to add in "forget client w.x.y.z", since we won't be limited to reading and writing only u64 values. Signed-off-by: Bryan Schumaker Signed-off-by: J. 
Bruce Fields --- fs/nfsd/fault_inject.c | 56 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c index 545f8e4ed101..19f9094bbb07 100644 --- a/fs/nfsd/fault_inject.c +++ b/fs/nfsd/fault_inject.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "state.h" @@ -48,10 +49,9 @@ static struct nfsd_fault_inject_op inject_ops[] = { static long int NUM_INJECT_OPS = sizeof(inject_ops) / sizeof(struct nfsd_fault_inject_op); static struct dentry *debug_dir; -static int nfsd_inject_set(void *op_ptr, u64 val) +static void nfsd_inject_set(struct nfsd_fault_inject_op *op, u64 val) { u64 count = 0; - struct nfsd_fault_inject_op *op = op_ptr; if (val == 0) printk(KERN_INFO "NFSD Fault Injection: %s (all)", op->file); @@ -62,19 +62,61 @@ static int nfsd_inject_set(void *op_ptr, u64 val) count = nfsd_for_n_state(val, op->forget); nfs4_unlock_state(); printk(KERN_INFO "NFSD: %s: found %llu", op->file, count); - return 0; } -static int nfsd_inject_get(void *op_ptr, u64 *val) +static void nfsd_inject_get(struct nfsd_fault_inject_op *op, u64 *val) { - struct nfsd_fault_inject_op *op = op_ptr; nfs4_lock_state(); *val = nfsd_for_n_state(0, op->print); nfs4_unlock_state(); - return 0; } -DEFINE_SIMPLE_ATTRIBUTE(fops_nfsd, nfsd_inject_get, nfsd_inject_set, "%llu\n"); +static ssize_t fault_inject_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + static u64 val; + char read_buf[25]; + size_t size, ret; + loff_t pos = *ppos; + + if (!pos) + nfsd_inject_get(file->f_dentry->d_inode->i_private, &val); + size = scnprintf(read_buf, sizeof(read_buf), "%llu\n", val); + + if (pos < 0) + return -EINVAL; + if (pos >= size || !len) + return 0; + if (len > size - pos) + len = size - pos; + ret = copy_to_user(buf, read_buf + pos, len); + if (ret == len) + return -EFAULT; + len -= ret; + *ppos = pos + len; + return len; +} + +static ssize_t fault_inject_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + char write_buf[24]; + size_t size = min(sizeof(write_buf), len) - 1; + u64 val; + + if (copy_from_user(write_buf, buf, size)) + return -EFAULT; + + val = simple_strtoll(write_buf, NULL, 0); + nfsd_inject_set(file->f_dentry->d_inode->i_private, val); + return len; /* on success, claim we got the whole input */ +} + +static const struct file_operations fops_nfsd = { + .owner = THIS_MODULE, + .read = fault_inject_read, + .write = fault_inject_write, +}; void nfsd_fault_inject_cleanup(void) { -- cgit v1.2.1 From 6c1e82a4b74ad0c8b45c833a4409f153199d9be4 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Thu, 29 Nov 2012 11:40:46 -0500 Subject: NFSD: Forget state for a specific client Write the client's ip address to any state file and all appropriate state for that client will be forgotten. Signed-off-by: Bryan Schumaker Signed-off-by: J. 
Bruce Fields --- fs/nfsd/fault_inject.c | 37 +++++++++++++++++++++++++++++++++---- fs/nfsd/nfs4state.c | 15 +++++++++++++++ fs/nfsd/state.h | 1 + 3 files changed, 49 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c index 19f9094bbb07..96ffdf55dcec 100644 --- a/fs/nfsd/fault_inject.c +++ b/fs/nfsd/fault_inject.c @@ -8,9 +8,12 @@ #include #include #include +#include +#include #include #include "state.h" +#include "netns.h" struct nfsd_fault_inject_op { char *file; @@ -64,6 +67,24 @@ static void nfsd_inject_set(struct nfsd_fault_inject_op *op, u64 val) printk(KERN_INFO "NFSD: %s: found %llu", op->file, count); } +static void nfsd_inject_set_client(struct nfsd_fault_inject_op *op, + struct sockaddr_storage *addr, + size_t addr_size) +{ + char buf[INET6_ADDRSTRLEN]; + struct nfs4_client *clp; + u64 count; + + nfs4_lock_state(); + clp = nfsd_find_client(addr, addr_size); + if (clp) { + count = op->forget(clp, 0); + rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, 129); + printk(KERN_INFO "NFSD [%s]: Client %s had %llu state object(s)\n", op->file, buf, count); + } + nfs4_unlock_state(); +} + static void nfsd_inject_get(struct nfsd_fault_inject_op *op, u64 *val) { nfs4_lock_state(); @@ -100,15 +121,23 @@ static ssize_t fault_inject_read(struct file *file, char __user *buf, static ssize_t fault_inject_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { - char write_buf[24]; + char write_buf[INET6_ADDRSTRLEN]; size_t size = min(sizeof(write_buf), len) - 1; + struct net *net = current->nsproxy->net_ns; + struct sockaddr_storage sa; u64 val; if (copy_from_user(write_buf, buf, size)) return -EFAULT; - - val = simple_strtoll(write_buf, NULL, 0); - nfsd_inject_set(file->f_dentry->d_inode->i_private, val); + write_buf[size] = '\0'; + + size = rpc_pton(net, write_buf, size, (struct sockaddr *)&sa, sizeof(sa)); + if (size > 0) + nfsd_inject_set_client(file->f_dentry->d_inode->i_private, &sa, size); + else { + val = simple_strtoll(write_buf, NULL, 0); + nfsd_inject_set(file->f_dentry->d_inode->i_private, val); + } return len; /* on success, claim we got the whole input */ } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9fb8e52580f3..eff734033437 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4763,6 +4763,21 @@ u64 nfsd_for_n_state(u64 max, u64 (*func)(struct nfs4_client *, u64)) return count; } +struct nfs4_client *nfsd_find_client(struct sockaddr_storage *addr, size_t addr_size) +{ + struct nfs4_client *clp; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, nfsd_net_id); + + if (!nfsd_netns_ready(nn)) + return NULL; + + list_for_each_entry(clp, &nn->client_lru, cl_lru) { + if (memcmp(&clp->cl_addr, addr, addr_size) == 0) + return clp; + } + return NULL; +} + #endif /* CONFIG_NFSD_FAULT_INJECTION */ /* initialization to perform at module load time: */ diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 4017f3553a63..d1c229feed52 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -502,6 +502,7 @@ extern void nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time); int nfsd_fault_inject_init(void); void nfsd_fault_inject_cleanup(void); u64 nfsd_for_n_state(u64, u64 (*)(struct nfs4_client *, u64)); +struct nfs4_client *nfsd_find_client(struct sockaddr_storage *, size_t); u64 nfsd_forget_client(struct nfs4_client *, u64); u64 nfsd_forget_client_locks(struct nfs4_client*, u64); -- cgit v1.2.1 From f9668a09e32ac6d2aa22f44cc310e430a8f4a40f Mon Sep 17 00:00:00 2001 From: Dave 
Chinner Date: Wed, 28 Nov 2012 13:01:03 +1100 Subject: xfs: fix sparse reported log CRC endian issue Not a bug as such, just warning noise from xlog_cksum() returning a __be32 type when it should be returning a __le32 type. On Wed, Nov 28, 2012 at 08:30:59AM -0500, Christoph Hellwig wrote: > But why are we storing the crc field little endian while all other on > disk formats are big endian? (And yes I realize it might as well have > been me who did that back in the day, but I still have no idea why) Because the CRC calculation always returns its result in LE format, even on BE systems. So rather than always having to byte-swap it everywhere and have all the force casts and annotations for sparse, it seems simpler to just make it a __le32 everywhere.... Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 2 +- fs/xfs/xfs_log_priv.h | 2 +- fs/xfs/xfs_log_recover.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index c49e2c12dba4..46bd9d52ab51 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1538,7 +1538,7 @@ xlog_pack_data( * This is a little more complicated than it should be because the various * headers and the actual data are non-contiguous. */ -__be32 +__le32 xlog_cksum( struct xlog *log, struct xlog_rec_header *rhead, diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index dc3498bf17c2..16d8d12ea3b4 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -555,7 +555,7 @@ extern int xlog_recover_finish( struct xlog *log); -extern __be32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, +extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, char *dp, int size); extern kmem_zone_t *xfs_log_ticket_zone; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 9c3651c9e75b..96fcbb85ff83 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3233,15 +3233,15 @@ xlog_unpack_data_crc( xfs_caddr_t dp, struct xlog *log) { - __be32 crc; + __le32 crc; crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); if (crc != rhead->h_crc) { if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { xfs_alert(log->l_mp, "log record CRC mismatch: found 0x%x, expected 0x%x.\n", le32_to_cpu(rhead->h_crc), le32_to_cpu(crc)); xfs_hex_dump(dp, 32); } -- cgit v1.2.1 From 9b2ef62b1541f176ea1b1f6e13b16df14bb16e99 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 3 Dec 2012 17:24:41 -0500 Subject: nfsd4: lockt, release_lockowner should renew clients Fix nfsd4_lockt and release_lockowner to look up the referenced client, so that it can renew it, or correctly return "expired", as appropriate. Also share some code while we're here. Reported-by: Frank Filz Signed-off-by: J.
Bruce Fields --- fs/nfsd/nfs4state.c | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index eff734033437..16e954c1c911 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3132,6 +3132,18 @@ void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status) free_generic_stateid(open->op_stp); } +static __be32 lookup_clientid(clientid_t *clid, bool session, struct nfsd_net *nn, struct nfs4_client **clp) +{ + struct nfs4_client *found; + + if (STALE_CLIENTID(clid, nn)) + return nfserr_stale_clientid; + found = find_confirmed_client(clid, session, nn); + if (clp) + *clp = found; + return found ? nfs_ok : nfserr_expired; +} + __be32 nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, clientid_t *clid) @@ -3143,16 +3155,9 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); dprintk("process_renew(%08x/%08x): starting\n", clid->cl_boot, clid->cl_id); - status = nfserr_stale_clientid; - if (STALE_CLIENTID(clid, nn)) - goto out; - clp = find_confirmed_client(clid, cstate->minorversion, nn); - status = nfserr_expired; - if (clp == NULL) { - /* We assume the client took too long to RENEW. */ - dprintk("nfsd4_renew: clientid not found!\n"); + status = lookup_clientid(clid, cstate->minorversion, nn, &clp); + if (status) goto out; - } status = nfserr_cb_path_down; if (!list_empty(&clp->cl_delegations) && clp->cl_cb_state != NFSD4_CB_UP) @@ -4293,9 +4298,11 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); - status = nfserr_stale_clientid; - if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid, nn)) - goto out; + if (!nfsd4_has_session(cstate)) { + status = lookup_clientid(&lockt->lt_clientid, false, nn, NULL); + if (status) + goto out; + } if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) goto out; @@ -4466,14 +4473,12 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", clid->cl_boot, clid->cl_id); - /* XXX check for lease expiration */ - - status = nfserr_stale_clientid; - if (STALE_CLIENTID(clid, nn)) - return status; - nfs4_lock_state(); + status = lookup_clientid(clid, cstate->minorversion, nn, NULL); + if (status) + goto out; + status = nfserr_locks_held; INIT_LIST_HEAD(&matches); -- cgit v1.2.1 From 57302e0ddf8a210a66fd8a1a2fa50844863b5ded Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 4 Dec 2012 08:25:11 -0800 Subject: vfs: avoid "attempt to access beyond end of device" warnings The block device access simplification that avoided accessing the (racy) block size information (commit bbec0270bdd8: "blkdev_max_block: make private to fs/buffer.c") no longer checks the maximum block size in the block mapping path. That was _almost_ as simple as just removing the code entirely, because the readers and writers all check the size of the device anyway, so under normal circumstances it "just worked". However, the block size may be such that the end of the device may straddle one single buffer_head. At which point we may still want to access the end of the device, but the buffer we use to access it partially extends past the end. The 'bd_set_size()' function intentionally sets the block size to avoid this, but mounting the device - or setting the block size by hand to some other value - can modify that block size. 
So instead, teach 'submit_bh()' about the special case of a buffer head straddling the end of the device, and turn such an access into a smaller IO access, avoiding the problem. This, btw, also means that unlike before, we can now access the whole device regardless of the device block size setting. So now, even if the device size is only 512-byte aligned, we can read and write even the last sector, even when using a much bigger block size for accessing the rest of the device. So with this, we could now get rid of the 'bd_set_size()' block size code entirely - resulting in faster IO for the common case - but that would be a separate patch. Reported-and-tested-by: Romain Francoise Reported-and-tested-by: Meelis Roos Reported-by: Tony Luck Signed-off-by: Linus Torvalds --- fs/buffer.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 3586fb05c8ce..c4e11390a44c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2893,6 +2893,55 @@ static void end_bio_bh_io_sync(struct bio *bio, int err) bio_put(bio); } +/* + * This allows us to do IO even on the odd last sectors + * of a device, even if the bh block size is some multiple + * of the physical sector size. + * + * We'll just truncate the bio to the size of the device, + * and clear the end of the buffer head manually. + * + * Truly out-of-range accesses will turn into actual IO + * errors, this only handles the "we need to be able to + * do IO at the final sector" case. + */ +static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) +{ + sector_t maxsector; + unsigned bytes; + + maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; + if (!maxsector) + return; + + /* + * If the *whole* IO is past the end of the device, + * let it through, and the IO layer will turn it into + * an EIO. + */ + if (unlikely(bio->bi_sector >= maxsector)) + return; + + maxsector -= bio->bi_sector; + bytes = bio->bi_size; + if (likely((bytes >> 9) <= maxsector)) + return; + + /* Uhhuh. We've got a bh that straddles the device size! */ + bytes = maxsector << 9; + + /* Truncate the bio.. */ + bio->bi_size = bytes; + bio->bi_io_vec[0].bv_len = bytes; + + /* ..and clear the end of the buffer for reads */ + if (rw & READ) { + void *kaddr = kmap_atomic(bh->b_page); + memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes); + kunmap_atomic(kaddr); + } +} + int submit_bh(int rw, struct buffer_head * bh) { struct bio *bio; @@ -2929,6 +2978,9 @@ int submit_bh(int rw, struct buffer_head * bh) bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; + /* Take care of bh's that straddle the end of the device */ + guard_bh_eod(rw, bio, bh); + bio_get(bio); submit_bio(rw, bio); -- cgit v1.2.1 From 879b38257bf2b6fa8406693a3b5b5a0649e7c594 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Wed, 5 Dec 2012 10:28:46 -0500 Subject: ext4: export inline xattr functions The inline data feature will need some inline xattr functions, so export them from fs/ext4/xattr.c so that inline.c can use them.
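As a hedged sketch of the intended consumer, which is not part of this patch: once ext4_xattr_ibody_get() is exported, inline.c can probe for an inline payload using the usual xattr convention of passing a NULL buffer to ask only for the size. The EXT4_XATTR_INDEX_SYSTEM index and the "data" attribute name below are assumptions about the inline-data layout, not something this diff establishes.

static int inline_data_size_sketch(struct inode *inode)
{
	/* assumed index and name for the inline-data xattr */
	int ret = ext4_xattr_ibody_get(inode, EXT4_XATTR_INDEX_SYSTEM,
				       "data", NULL, 0);
	if (ret == -ENODATA)
		return 0;	/* no inline data stored in the inode body */
	return ret;		/* otherwise the size of the inline payload */
}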
Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/xattr.c | 39 ++++++-------------------------------- fs/ext4/xattr.h | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index b1adda1b750d..a47dc3883a23 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -61,11 +61,6 @@ #include "xattr.h" #include "acl.h" -#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) -#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) -#define BFIRST(bh) ENTRY(BHDR(bh)+1) -#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) - #ifdef EXT4_XATTR_DEBUG # define ea_idebug(inode, f...) do { \ printk(KERN_DEBUG "inode %s:%lu: ", \ @@ -312,7 +307,7 @@ cleanup: return error; } -static int +int ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) { @@ -581,21 +576,6 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, return (*min_offs - ((void *)last - base) - sizeof(__u32)); } -struct ext4_xattr_info { - int name_index; - const char *name; - const void *value; - size_t value_len; -}; - -struct ext4_xattr_search { - struct ext4_xattr_entry *first; - void *base; - void *end; - struct ext4_xattr_entry *here; - int not_found; -}; - static int ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) { @@ -949,14 +929,8 @@ bad_block: #undef header } -struct ext4_xattr_ibody_find { - struct ext4_xattr_search s; - struct ext4_iloc iloc; -}; - -static int -ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is) +int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) { struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; @@ -984,10 +958,9 @@ ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, return 0; } -static int -ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, - struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is) +int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) { struct ext4_xattr_ibody_header *header; struct ext4_xattr_search *s = &is->s; diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 91f31ca7d9af..40ca7a6f5eec 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -65,6 +65,32 @@ struct ext4_xattr_entry { EXT4_I(inode)->i_extra_isize)) #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) +#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) +#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) +#define BFIRST(bh) ENTRY(BHDR(bh)+1) +#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) + + +struct ext4_xattr_info { + int name_index; + const char *name; + const void *value; + size_t value_len; +}; + +struct ext4_xattr_search { + struct ext4_xattr_entry *first; + void *base; + void *end; + struct ext4_xattr_entry *here; + int not_found; +}; + +struct ext4_xattr_ibody_find { + struct ext4_xattr_search s; + struct ext4_iloc iloc; +}; + # ifdef CONFIG_EXT4_FS_XATTR extern const struct xattr_handler ext4_xattr_user_handler; @@ -90,6 +116,15 @@ extern void ext4_exit_xattr(void); extern const struct xattr_handler *ext4_xattr_handlers[]; +extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is); +extern int ext4_xattr_ibody_get(struct inode *inode, int name_index, + 
const char *name, + void *buffer, size_t buffer_size); +extern int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is); + # else /* CONFIG_EXT4_FS_XATTR */ static inline int @@ -143,6 +178,29 @@ ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, #define ext4_xattr_handlers NULL +static inline int +ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) +{ + return -EOPNOTSUPP; +} + +static inline int +ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) +{ + return -EOPNOTSUPP; +} + +static inline int +ext4_xattr_ibody_get(struct inode *inode, int name_index, + const char *name, + void *buffer, size_t buffer_size) +{ + return -EOPNOTSUPP; +} + # endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY -- cgit v1.2.1 From 27d7c2a006a81c04fab00b8cd81b99af3b32738d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 5 Dec 2012 20:01:24 +0300 Subject: vfs: clear to the end of the buffer on partial buffer reads READ is zero so the "rw & READ" test is always false. The intended test was "((rw & RW_MASK) == READ)". Signed-off-by: Dan Carpenter Signed-off-by: Linus Torvalds --- fs/buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index c4e11390a44c..ec0aca8ba6bf 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2935,7 +2935,7 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) bio->bi_io_vec[0].bv_len = bytes; /* ..and clear the end of the buffer for reads */ - if (rw & READ) { + if ((rw & RW_MASK) == READ) { void *kaddr = kmap_atomic(bh->b_page); memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes); kunmap_atomic(kaddr); -- cgit v1.2.1 From 81bcd8b795229c70d7244898efe282846e3b14ce Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 25 Nov 2012 00:07:44 -0600 Subject: default authentication needs to be at least ntlmv2 security for cifs mounts We had planned to upgrade to ntlmv2 security a few releases ago, and have been warning users in dmesg on mount about the impending upgrade, but had to make a change (to use nltmssp with ntlmv2) due to testing issues with some non-Windows, non-Samba servers. The approach in this patch is simpler than earlier patches, and changes the default authentication mechanism to ntlmv2 password hashes (encapsulated in ntlmssp) from ntlm (ntlm is too weak for current use and ntlmv2 has been broadly supported for many, many years). 
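The practical effect of the new default can be shown with a small standalone flag check; the flag values mirror the cifsglob.h hunk below, but the helper itself is hypothetical. When no sec= mount option is given, vol->secFlg stays 0 and CIFSSEC_DEF now selects NTLMSSP-encapsulated NTLMv2 instead of bare NTLM.

#include <stdbool.h>

#define CIFSSEC_MAY_SIGN	0x00001
#define CIFSSEC_MAY_NTLMSSP	0x00080	/* raw ntlmssp with ntlmv2 */
#define CIFSSEC_DEF		(CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMSSP)

static bool default_selects_ntlmssp(unsigned int secFlg)
{
	if (secFlg == 0)	/* no sec= option given on the mount */
		secFlg = CIFSSEC_DEF;
	return (secFlg & CIFSSEC_MAY_NTLMSSP) != 0;
}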
Signed-off-by: Steve French Acked-by: Jeff Layton --- fs/cifs/cifsglob.h | 2 +- fs/cifs/connect.c | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index f5af2527fc69..2cd5ea2042ed 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1362,7 +1362,7 @@ require use of the stronger protocol */ #define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */ #define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */ -#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP) +#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMSSP) #define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2) #define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP) /* diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 5c670b998ffb..32fb50e7932b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2397,8 +2397,6 @@ cifs_set_cifscreds(struct smb_vol *vol __attribute__((unused)), } #endif /* CONFIG_KEYS */ -static bool warned_on_ntlm; /* globals init to false automatically */ - static struct cifs_ses * cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) { @@ -2475,14 +2473,6 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) ses->cred_uid = volume_info->cred_uid; ses->linux_uid = volume_info->linux_uid; - /* ntlmv2 is much stronger than ntlm security, and has been broadly - supported for many years, time to update default security mechanism */ - if ((volume_info->secFlg == 0) && warned_on_ntlm == false) { - warned_on_ntlm = true; - cERROR(1, "default security mechanism requested. The default " - "security mechanism will be upgraded from ntlm to " - "ntlmv2 in kernel release 3.3"); - } ses->overrideSecFlg = volume_info->secFlg; mutex_lock(&ses->session_mutex); -- cgit v1.2.1 From 60654ce047f7be62afa291573501e011297a47d8 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 25 Nov 2012 08:00:34 -0500 Subject: cifs: fix types on module parameters Most of these are unsigned ints, so we should be passing "uint" to module_param. Also, get rid of the extra "(bool)" in the description of enable_oplocks. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index e7931cc55d0c..07a8ab527c3a 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -64,24 +64,23 @@ unsigned int global_secflags = CIFSSEC_DEF; unsigned int sign_CIFS_PDUs = 1; static const struct super_operations cifs_super_ops; unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; -module_param(CIFSMaxBufSize, int, 0); +module_param(CIFSMaxBufSize, uint, 0); MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). " "Default: 16384 Range: 8192 to 130048"); unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL; -module_param(cifs_min_rcv, int, 0); +module_param(cifs_min_rcv, uint, 0); MODULE_PARM_DESC(cifs_min_rcv, "Network buffers in pool. Default: 4 Range: " "1 to 64"); unsigned int cifs_min_small = 30; -module_param(cifs_min_small, int, 0); +module_param(cifs_min_small, uint, 0); MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. 
Default: 30 " "Range: 2 to 256"); unsigned int cifs_max_pending = CIFS_MAX_REQ; -module_param(cifs_max_pending, int, 0444); +module_param(cifs_max_pending, uint, 0444); MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " "Default: 32767 Range: 2 to 32767."); module_param(enable_oplocks, bool, 0644); -MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks (bool). Default:" - "y/Y/1"); +MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1"); extern mempool_t *cifs_sm_req_poolp; extern mempool_t *cifs_req_poolp; -- cgit v1.2.1 From c78cd83805d43198e1ef452fba27fa049db6387f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 25 Nov 2012 08:00:35 -0500 Subject: cifs: clean up id_mode_to_cifs_acl Add a label we can goto on error, and get rid of some excess indentation. Also move to kernel-style comments. Reviewed-by: Shirish Pargaonkar Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 53 +++++++++++++++++++++++++---------------------------- 1 file changed, 25 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 0fb15bbbe43c..b45ec7426ae3 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1307,42 +1307,39 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, /* Get the security descriptor */ pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen); - - /* Add three ACEs for owner, group, everyone getting rid of - other ACEs as chmod disables ACEs and set the security descriptor */ - if (IS_ERR(pntsd)) { rc = PTR_ERR(pntsd); cERROR(1, "%s: error %d getting sec desc", __func__, rc); - } else { - /* allocate memory for the smb header, - set security descriptor request security descriptor - parameters, and secuirty descriptor itself */ - - secdesclen = secdesclen < DEFSECDESCLEN ? - DEFSECDESCLEN : secdesclen; - pnntsd = kmalloc(secdesclen, GFP_KERNEL); - if (!pnntsd) { - cERROR(1, "Unable to allocate security descriptor"); - kfree(pntsd); - return -ENOMEM; - } + goto out; + } - rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid, - &aclflag); + /* + * Add three ACEs for owner, group, everyone getting rid of other ACEs + * as chmod disables ACEs and set the security descriptor. 
Allocate + * memory for the smb header, set security descriptor request security + * descriptor parameters, and secuirty descriptor itself + */ + secdesclen = max_t(u32, secdesclen, DEFSECDESCLEN); + pnntsd = kmalloc(secdesclen, GFP_KERNEL); + if (!pnntsd) { + cERROR(1, "Unable to allocate security descriptor"); + kfree(pntsd); + return -ENOMEM; + } - cFYI(DBG2, "build_sec_desc rc: %d", rc); + rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid, + &aclflag); - if (!rc) { - /* Set the security descriptor */ - rc = set_cifs_acl(pnntsd, secdesclen, inode, - path, aclflag); - cFYI(DBG2, "set_cifs_acl rc: %d", rc); - } + cFYI(DBG2, "build_sec_desc rc: %d", rc); - kfree(pnntsd); - kfree(pntsd); + if (!rc) { + /* Set the security descriptor */ + rc = set_cifs_acl(pnntsd, secdesclen, inode, path, aclflag); + cFYI(DBG2, "set_cifs_acl rc: %d", rc); } + kfree(pnntsd); + kfree(pntsd); +out: return rc; } -- cgit v1.2.1 From fc03d8a5a18172ebdb2402cc355abb8fd3cbb844 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 25 Nov 2012 08:00:35 -0500 Subject: cifs: move num_subauth check inside of CONFIG_CIFS_DEBUG2 check in parse_sid() Reviewed-by: Shirish Pargaonkar Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index b45ec7426ae3..d35579a1640a 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -987,8 +987,8 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl) return -EINVAL; } - if (psid->num_subauth) { #ifdef CONFIG_CIFS_DEBUG2 + if (psid->num_subauth) { int i; cFYI(1, "SID revision %d num_auth %d", psid->revision, psid->num_subauth); @@ -1002,8 +1002,8 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl) num auths and therefore go off the end */ cFYI(1, "RID 0x%x", le32_to_cpu(psid->sub_auth[psid->num_subauth-1])); -#endif } +#endif return 0; } -- cgit v1.2.1 From 852e22950dc47e774bb602b16f55fed42afac5fb Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 25 Nov 2012 08:00:36 -0500 Subject: cifs: use the NUM_AUTHS and NUM_SUBAUTHS constants in cifsacl code ...instead of hardcoding in '5' and '6' all over the place. 
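A note on why the named constants matter here: once the array bounds and the size checks share one pair of macros, the compiler can police the on-the-wire SID layout instead of relying on '5' and '6' staying in sync by hand. A minimal standalone sketch (userspace C11, not part of the patch; the struct name is illustrative):

#include <assert.h>
#include <stdint.h>

#define NUM_AUTHS 6
#define NUM_SUBAUTHS 5

struct sid {
	uint8_t  revision;
	uint8_t  num_subauth;
	uint8_t  authority[NUM_AUTHS];
	uint32_t sub_auth[NUM_SUBAUTHS];
} __attribute__((packed));

/* if either constant drifts, the wire-size check fails at compile time */
static_assert(sizeof(struct sid) == 2 + NUM_AUTHS + 4 * NUM_SUBAUTHS,
	      "SID wire layout changed");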
Reviewed-by: Shirish Pargaonkar Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 6 +++--- fs/cifs/cifsacl.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index d35579a1640a..18437c5561fe 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -210,7 +210,7 @@ sid_to_str(struct cifs_sid *sidptr, char *sidstr) sprintf(strptr, "-%d", sidptr->revision); strptr = sidstr + strlen(sidstr); - for (i = 0; i < 6; ++i) { + for (i = 0; i < NUM_AUTHS; ++i) { if (sidptr->authority[i]) { sprintf(strptr, "-%d", sidptr->authority[i]); strptr = sidstr + strlen(sidstr); @@ -649,7 +649,7 @@ int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid) } /* compare all of the six auth values */ - for (i = 0; i < 6; ++i) { + for (i = 0; i < NUM_AUTHS; ++i) { if (ctsid->authority[i] != cwsid->authority[i]) { if (ctsid->authority[i] > cwsid->authority[i]) return 1; @@ -811,7 +811,7 @@ static __u16 fill_ace_for_sid(struct cifs_ace *pntace, pntace->sid.revision = psid->revision; pntace->sid.num_subauth = psid->num_subauth; - for (i = 0; i < 6; i++) + for (i = 0; i < NUM_AUTHS; i++) pntace->sid.authority[i] = psid->authority[i]; for (i = 0; i < psid->num_subauth; i++) pntace->sid.sub_auth[i] = psid->sub_auth[i]; diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h index 5c902c7ce524..80e0d66a403d 100644 --- a/fs/cifs/cifsacl.h +++ b/fs/cifs/cifsacl.h @@ -60,8 +60,8 @@ struct cifs_ntsd { struct cifs_sid { __u8 revision; /* revision level */ __u8 num_subauth; - __u8 authority[6]; - __le32 sub_auth[5]; /* sub_auth[num_subauth] */ + __u8 authority[NUM_AUTHS]; + __le32 sub_auth[NUM_SUBAUTHS]; /* sub_auth[num_subauth] */ } __attribute__((packed)); struct cifs_acl { -- cgit v1.2.1 From 436bb435fcbe2d52678ec7e2abc45fd1938601ce Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 25 Nov 2012 08:00:36 -0500 Subject: cifs: make compare_sids static ..nothing outside of cifsacl.c calls it. Also fix the incorrect comment on the function. It returns 0 when they match. Reviewed-by: Shirish Pargaonkar Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 98 ++++++++++++++++++++++++++++--------------------------- fs/cifs/cifsacl.h | 2 -- 2 files changed, 50 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 18437c5561fe..5a312eb45a92 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -224,6 +224,56 @@ sid_to_str(struct cifs_sid *sidptr, char *sidstr) } } +/* + * if the two SIDs (roughly equivalent to a UUID for a user or group) are + * the same returns zero, if they do not match returns non-zero. + */ +static int +compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid) +{ + int i; + int num_subauth, num_sat, num_saw; + + if ((!ctsid) || (!cwsid)) + return 1; + + /* compare the revision */ + if (ctsid->revision != cwsid->revision) { + if (ctsid->revision > cwsid->revision) + return 1; + else + return -1; + } + + /* compare all of the six auth values */ + for (i = 0; i < NUM_AUTHS; ++i) { + if (ctsid->authority[i] != cwsid->authority[i]) { + if (ctsid->authority[i] > cwsid->authority[i]) + return 1; + else + return -1; + } + } + + /* compare all of the subauth values if any */ + num_sat = ctsid->num_subauth; + num_saw = cwsid->num_subauth; + num_subauth = num_sat < num_saw ? 
num_sat : num_saw; + if (num_subauth) { + for (i = 0; i < num_subauth; ++i) { + if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) { + if (le32_to_cpu(ctsid->sub_auth[i]) > + le32_to_cpu(cwsid->sub_auth[i])) + return 1; + else + return -1; + } + } + } + + return 0; /* sids compare/match */ +} + static void cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src) { @@ -630,54 +680,6 @@ cifs_destroy_idmaptrees(void) spin_unlock(&gidsidlock); } -/* if the two SIDs (roughly equivalent to a UUID for a user or group) are - the same returns 1, if they do not match returns 0 */ -int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid) -{ - int i; - int num_subauth, num_sat, num_saw; - - if ((!ctsid) || (!cwsid)) - return 1; - - /* compare the revision */ - if (ctsid->revision != cwsid->revision) { - if (ctsid->revision > cwsid->revision) - return 1; - else - return -1; - } - - /* compare all of the six auth values */ - for (i = 0; i < NUM_AUTHS; ++i) { - if (ctsid->authority[i] != cwsid->authority[i]) { - if (ctsid->authority[i] > cwsid->authority[i]) - return 1; - else - return -1; - } - } - - /* compare all of the subauth values if any */ - num_sat = ctsid->num_subauth; - num_saw = cwsid->num_subauth; - num_subauth = num_sat < num_saw ? num_sat : num_saw; - if (num_subauth) { - for (i = 0; i < num_subauth; ++i) { - if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) { - if (le32_to_cpu(ctsid->sub_auth[i]) > - le32_to_cpu(cwsid->sub_auth[i])) - return 1; - else - return -1; - } - } - } - - return 0; /* sids compare/match */ -} - - /* copy ntsd, owner sid, and group sid from a security descriptor to another */ static void copy_sec_desc(const struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, __u32 sidsoffset) diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h index 80e0d66a403d..18c7521273a7 100644 --- a/fs/cifs/cifsacl.h +++ b/fs/cifs/cifsacl.h @@ -98,6 +98,4 @@ extern struct key_type cifs_idmap_key_type; extern const struct cred *root_cred; #endif /* KERNEL */ -extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *); - #endif /* _CIFSACL_H */ -- cgit v1.2.1 From 36f87ee70f754d04e55518853e6fb30ed4732dda Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 25 Nov 2012 08:00:37 -0500 Subject: cifs: make cifs_copy_sid handle a source sid with variable size subauth arrays ...and lift the restriction in id_to_sid upcall that the size must be at least as big as a full cifs_sid. 
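The hazard being removed can be shown in isolation: a whole-struct memcpy always reads sizeof(*dst) bytes from the source, even when the upcall only returned the fixed header plus a couple of sub-authorities. A hedged sketch of the field-wise copy, reusing the struct sid layout sketched above:

static void sid_copy(struct sid *dst, const struct sid *src)
{
	unsigned int i;

	/* memcpy(dst, src, sizeof(*dst)) could read past a short src buffer */
	dst->revision = src->revision;
	dst->num_subauth = src->num_subauth < NUM_SUBAUTHS ?
				src->num_subauth : NUM_SUBAUTHS;
	for (i = 0; i < NUM_AUTHS; ++i)
		dst->authority[i] = src->authority[i];
	for (i = 0; i < dst->num_subauth; ++i)
		dst->sub_auth[i] = src->sub_auth[i];
}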
Reviewed-by: Shirish Pargaonkar Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 10 ++++++++-- fs/cifs/cifsacl.h | 3 +++ 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 5a312eb45a92..141a944c9dfd 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -277,8 +277,14 @@ compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid) static void cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src) { - memcpy(dst, src, sizeof(*dst)); + int i; + + dst->revision = src->revision; dst->num_subauth = min_t(u8, src->num_subauth, NUM_SUBAUTHS); + for (i = 0; i < NUM_AUTHS; ++i) + dst->authority[i] = src->authority[i]; + for (i = 0; i < dst->num_subauth; ++i) + dst->sub_auth[i] = src->sub_auth[i]; } static void @@ -427,7 +433,7 @@ id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid) if (IS_ERR(sidkey)) { rc = -EINVAL; cFYI(1, "%s: Can't map and id to a SID", __func__); - } else if (sidkey->datalen < sizeof(struct cifs_sid)) { + } else if (sidkey->datalen < CIFS_SID_BASE_SIZE) { rc = -EIO; cFYI(1, "%s: Downcall contained malformed key " "(datalen=%hu)", __func__, sidkey->datalen); diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h index 18c7521273a7..7e52f19f996f 100644 --- a/fs/cifs/cifsacl.h +++ b/fs/cifs/cifsacl.h @@ -64,6 +64,9 @@ struct cifs_sid { __le32 sub_auth[NUM_SUBAUTHS]; /* sub_auth[num_subauth] */ } __attribute__((packed)); +/* size of a struct cifs_sid, sans sub_auth array */ +#define CIFS_SID_BASE_SIZE (1 + 1 + NUM_AUTHS) + struct cifs_acl { __le16 revision; /* revision level */ __le16 size; -- cgit v1.2.1 From 30c9d6cca526243abe6c08eb6fa03db9d2b1a630 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 25 Nov 2012 08:00:37 -0500 Subject: cifs: redefine NUM_SUBAUTH constant from 5 to 15 According to several places on the Internet and the Samba winbind code, this is hard-limited to 15 in Windows, not 5. This does balloon out the allocation of each cifs_sid by 40 bytes, but I don't see any alternative. Also, rename it to SID_MAX_SUB_AUTHORITIES to match the alleged name of this constant in the Windows header files. Finally, rename SIDLEN to SID_STRING_MAX, fix the value to reflect the change to SID_MAX_SUB_AUTHORITIES, and document how it was determined.
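The new 195-byte figure can be sanity-checked mechanically: "S-" plus a 3-digit revision, six "-255" authority fields, and fifteen "-4294967295" sub-authority fields come to 194 characters plus the NUL. A quick standalone check (userspace, illustrative only):

#include <stdio.h>

#define SID_MAX_SUB_AUTHORITIES 15
#define SID_STRING_MAX 195	/* 2 + 3 + 6*4 + 15*11 + 1 */

int main(void)
{
	char buf[SID_STRING_MAX];
	int i, n;

	n = sprintf(buf, "S-%u", 255u);			/* worst-case revision */
	for (i = 0; i < 6; ++i)
		n += sprintf(buf + n, "-%u", 255u);	/* worst-case authorities */
	for (i = 0; i < SID_MAX_SUB_AUTHORITIES; ++i)
		n += sprintf(buf + n, "-%u", 4294967295u); /* worst-case subauths */
	printf("%d chars + NUL fits in %d\n", n, SID_STRING_MAX);
	return 0;
}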
Reviewed-by: Shirish Pargaonkar Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 6 +++--- fs/cifs/cifsacl.h | 19 ++++++++++++++++--- 2 files changed, 19 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 141a944c9dfd..dd8d3df74298 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -280,7 +280,7 @@ cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src) int i; dst->revision = src->revision; - dst->num_subauth = min_t(u8, src->num_subauth, NUM_SUBAUTHS); + dst->num_subauth = min_t(u8, src->num_subauth, SID_MAX_SUB_AUTHORITIES); for (i = 0; i < NUM_AUTHS; ++i) dst->authority[i] = src->authority[i]; for (i = 0; i < dst->num_subauth; ++i) @@ -383,7 +383,7 @@ id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid) if (!npsidid) return -ENOMEM; - npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL); + npsidid->sidstr = kmalloc(SID_STRING_MAX, GFP_KERNEL); if (!npsidid->sidstr) { kfree(npsidid); return -ENOMEM; @@ -500,7 +500,7 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, if (!npsidid) return -ENOMEM; - npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL); + npsidid->sidstr = kmalloc(SID_STRING_MAX, GFP_KERNEL); if (!npsidid->sidstr) { kfree(npsidid); return -ENOMEM; diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h index 7e52f19f996f..8b980cd445c0 100644 --- a/fs/cifs/cifsacl.h +++ b/fs/cifs/cifsacl.h @@ -24,7 +24,7 @@ #define NUM_AUTHS 6 /* number of authority fields */ -#define NUM_SUBAUTHS 5 /* number of sub authority fields */ +#define SID_MAX_SUB_AUTHORITIES (15) /* max number of sub authority fields */ #define NUM_WK_SIDS 7 /* number of well known sids */ #define SIDNAMELENGTH 20 /* long enough for the ones we care about */ #define DEFSECDESCLEN 192 /* sec desc len contaiting a dacl with three aces */ @@ -41,7 +41,20 @@ #define SIDOWNER 1 #define SIDGROUP 2 -#define SIDLEN 150 /* S- 1 revision- 6 authorities- max 5 sub authorities */ + +/* + * Maximum size of a string representation of a SID: + * + * The fields are unsigned values in decimal. So: + * + * u8: max 3 bytes in decimal + * u32: max 10 bytes in decimal + * + * "S-" + 3 bytes for version field + 4 bytes for each authority field (3 bytes + * per number + 1 for '-') + 11 bytes for each subauthority field (10 bytes + * per number + 1 for '-') + NULL terminator. + */ +#define SID_STRING_MAX (195) #define SID_ID_MAPPED 0 #define SID_ID_PENDING 1 @@ -61,7 +74,7 @@ struct cifs_sid { __u8 revision; /* revision level */ __u8 num_subauth; __u8 authority[NUM_AUTHS]; - __le32 sub_auth[NUM_SUBAUTHS]; /* sub_auth[num_subauth] */ + __le32 sub_auth[SID_MAX_SUB_AUTHORITIES]; /* sub_auth[num_subauth] */ } __attribute__((packed)); /* size of a struct cifs_sid, sans sub_auth array */ -- cgit v1.2.1 From ee13b2ba7488475b47ae8dab2eebc4f5fd6838c5 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 25 Nov 2012 08:00:38 -0500 Subject: cifs: fix the format specifiers in sid_to_str The format specifiers are for signed values, but these are unsigned. Given that '-' is a delimiter between fields, I don't think you'd get what you'd expect if you got a value here that would overflow the sign bit. The version and authority fields are 8 bit values so use a "hh" length modifier there. The subauths are 32 bit values, so there's no need to use a "l" length modifier there. 
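The misprint is easy to demonstrate on its own: with a signed conversion, any sub-authority above INT_MAX comes out negative, and the stray sign collides with the '-' field delimiter. A two-line illustration (the explicit cast makes the effect visible in portable userspace C; the exact output of the buggy line is implementation-defined):

#include <stdio.h>

int main(void)
{
	unsigned int saval = 3232236033u;	/* a sub-auth above INT_MAX */

	printf("old: -%d\n", (int)saval);	/* "--1062731263": bogus extra '-' */
	printf("new: -%u\n", saval);		/* "-3232236033": what was intended */
	return 0;
}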
Reviewed-by: Shirish Pargaonkar Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index dd8d3df74298..9adcdb5a1001 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -199,27 +199,24 @@ static void sid_to_str(struct cifs_sid *sidptr, char *sidstr) { int i; - unsigned long saval; + unsigned int saval; char *strptr; strptr = sidstr; - sprintf(strptr, "%s", "S"); - strptr = sidstr + strlen(sidstr); - - sprintf(strptr, "-%d", sidptr->revision); + sprintf(strptr, "S-%hhu", sidptr->revision); strptr = sidstr + strlen(sidstr); for (i = 0; i < NUM_AUTHS; ++i) { if (sidptr->authority[i]) { - sprintf(strptr, "-%d", sidptr->authority[i]); + sprintf(strptr, "-%hhu", sidptr->authority[i]); strptr = sidstr + strlen(sidstr); } } for (i = 0; i < sidptr->num_subauth; ++i) { saval = le32_to_cpu(sidptr->sub_auth[i]); - sprintf(strptr, "-%ld", saval); + sprintf(strptr, "-%u", saval); strptr = sidstr + strlen(sidstr); } } -- cgit v1.2.1 From b1a6dc21d1a731fdb71fcd683ef856c6af0b3f23 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 25 Nov 2012 08:00:38 -0500 Subject: cifs: remove uneeded __KERNEL__ block from cifsacl.h ...and make those symbols static in cifsacl.c. Nothing outside of that file refers to them. Reviewed-by: Shirish Pargaonkar Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 4 ++-- fs/cifs/cifsacl.h | 5 ----- 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 9adcdb5a1001..42b3fe981a0a 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -42,7 +42,7 @@ static const struct cifs_sid sid_authusers = { /* group users */ static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; -const struct cred *root_cred; +static const struct cred *root_cred; static void shrink_idmap_tree(struct rb_root *root, int nr_to_scan, int *nr_rem, @@ -187,7 +187,7 @@ cifs_idmap_key_destroy(struct key *key) kfree(key->payload.data); } -struct key_type cifs_idmap_key_type = { +static struct key_type cifs_idmap_key_type = { .name = "cifs.idmap", .instantiate = cifs_idmap_key_instantiate, .destroy = cifs_idmap_key_destroy, diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h index 8b980cd445c0..249c94f39635 100644 --- a/fs/cifs/cifsacl.h +++ b/fs/cifs/cifsacl.h @@ -109,9 +109,4 @@ struct cifs_sid_id { struct cifs_sid sid; }; -#ifdef __KERNEL__ -extern struct key_type cifs_idmap_key_type; -extern const struct cred *root_cred; -#endif /* KERNEL */ - #endif /* _CIFSACL_H */ -- cgit v1.2.1 From d3d1fce11dbbf4246f1c37839b13757f08aec3b7 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 25 Nov 2012 08:00:40 -0500 Subject: cifs: don't override the uid/gid in getattr when cifsacl is enabled If we're using cifsacl, then we don't want to override the uid/gid with the current uid/gid, since that would prevent you from being able to upcall for this info. 
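Collapsed into one predicate, the patched logic says: fake up local ownership only when the mount is multiuser, cifsacl is off, and unix extensions are absent. A restatement as a hypothetical helper (not in the patch):

static bool use_local_ownership(unsigned int mnt_flags, bool unix_ext)
{
	return (mnt_flags & CIFS_MOUNT_MULTIUSER) &&
	       !(mnt_flags & CIFS_MOUNT_CIFS_ACL) &&
	       !unix_ext;
}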
Reviewed-by: Shirish Pargaonkar Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/inode.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index afdff79651f1..ed6208ff85a7 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1791,11 +1791,12 @@ int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, stat->ino = CIFS_I(inode)->uniqueid; /* - * If on a multiuser mount without unix extensions, and the admin hasn't - * overridden them, set the ownership to the fsuid/fsgid of the current - * process. + * If on a multiuser mount without unix extensions or cifsacl being + * enabled, and the admin hasn't overridden them, set the ownership + * to the fsuid/fsgid of the current process. */ if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) && + !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) && !tcon->unix_ext) { if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)) stat->uid = current_fsuid(); -- cgit v1.2.1 From e5e69abd058b3fcfd484dbe1c632347332cda9b6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 25 Nov 2012 08:00:42 -0500 Subject: cifs: make error on lack of a unc= option more explicit Error out with a clear error message if there is no unc= option. The existing code doesn't handle this in a clear fashion, and the check for a UNCip option with no UNC string is just plain wrong. Later, we'll fix the code to not require a unc= option, but for now we need this to at least clarify why people are getting errors about DFS parsing. With this change we can also get rid of some later NULL pointer checks since we know the UNC and UNCip will never be NULL there. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 32fb50e7932b..a48387265cd4 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1799,6 +1799,11 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, goto cifs_parse_mount_err; } #endif + if (!vol->UNC) { + cERROR(1, "CIFS mount error: No UNC path (e.g. -o " + "unc=\\\\192.168.1.100\\public) specified"); + goto cifs_parse_mount_err; + } if (vol->UNCip == NULL) vol->UNCip = &vol->UNC[2]; @@ -2070,17 +2075,6 @@ cifs_get_tcp_session(struct smb_vol *volume_info) rc = -EINVAL; goto out_err; } - } else if (volume_info->UNCip) { - /* BB using ip addr as tcp_ses name to connect to the - DFS root below */ - cERROR(1, "Connecting to DFS root not implemented yet"); - rc = -EINVAL; - goto out_err; - } else /* which tcp_sess DFS root would we conect to */ { - cERROR(1, "CIFS mount error: No UNC path (e.g. -o " - "unc=//192.168.1.100/public) specified"); - rc = -EINVAL; - goto out_err; - } /* see if we already have a matching tcp_ses */ @@ -2726,9 +2720,6 @@ cifs_match_super(struct super_block *sb, void *data) volume_info = mnt_data->vol; - if (!volume_info->UNCip || !volume_info->UNC) - goto out; - rc = cifs_fill_sockaddr((struct sockaddr *)&addr, volume_info->UNCip, strlen(volume_info->UNCip), -- cgit v1.2.1 From 6d3ea7e4975aed451fbee4dea2fef63b0de8cb4f Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 28 Nov 2012 22:34:41 -0600 Subject: CIFS: Make use of common cifs_build_path_to_root for CIFS and SMB2 because there is no difference here. This also adds support for the prefixpath mount option to SMB2.
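To see what the now-shared helper produces, here is a rough userspace approximation with illustrative inputs (the real code appends the prepath directly after the DFS tree name and then converts delimiters; the inputs and buffer handling below are simplified):

#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *tree = "\\\\srv\\share";	/* used only when share is in DFS */
	const char *prepath = "/dir/sub";	/* from -o prefixpath= */
	char full[256];
	char *p = full;

	snprintf(full, sizeof(full), "%s%s", tree, prepath);
	while ((p = strchr(p, '/')))		/* convert_delimiter() */
		*p = '\\';
	printf("%s\n", full);			/* \\srv\share\dir\sub */
	return 0;
}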
Signed-off-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 4 ++-- fs/cifs/cifsglob.h | 12 ------------ fs/cifs/cifsproto.h | 3 +++ fs/cifs/connect.c | 12 ++++++++---- fs/cifs/dir.c | 31 +++++++++++++++++++++++++++++++ fs/cifs/smb1ops.c | 32 -------------------------------- fs/cifs/smb2ops.c | 18 ------------------ 7 files changed, 44 insertions(+), 68 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 07a8ab527c3a..273b34904d5b 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -539,8 +539,8 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) char *s, *p; char sep; - full_path = build_path_to_root(vol, cifs_sb, - cifs_sb_master_tcon(cifs_sb)); + full_path = cifs_build_path_to_root(vol, cifs_sb, + cifs_sb_master_tcon(cifs_sb)); if (full_path == NULL) return ERR_PTR(-ENOMEM); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 2cd5ea2042ed..d1a93d32db81 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -280,9 +280,6 @@ struct smb_version_operations { /* set attributes */ int (*set_file_info)(struct inode *, const char *, FILE_BASIC_INFO *, const unsigned int); - /* build a full path to the root of the mount */ - char * (*build_path_to_root)(struct smb_vol *, struct cifs_sb_info *, - struct cifs_tcon *); /* check if we can send an echo or nor */ bool (*can_echo)(struct TCP_Server_Info *); /* send echo request */ @@ -1084,15 +1081,6 @@ convert_delimiter(char *path, char delim) } } -static inline char * -build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, - struct cifs_tcon *tcon) -{ - if (!vol->ops->build_path_to_root) - return NULL; - return vol->ops->build_path_to_root(vol, cifs_sb, tcon); -} - #ifdef CONFIG_CIFS_STATS #define cifs_stats_inc atomic_inc diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 5144e9fbeb8c..7494358ba533 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -60,6 +60,9 @@ extern int init_cifs_idmap(void); extern void exit_cifs_idmap(void); extern void cifs_destroy_idmaptrees(void); extern char *build_path_from_dentry(struct dentry *); +extern char *cifs_build_path_to_root(struct smb_vol *vol, + struct cifs_sb_info *cifs_sb, + struct cifs_tcon *tcon); extern char *build_wildcard_path_from_dentry(struct dentry *direntry); extern char *cifs_compose_mount_options(const char *sb_mountdata, const char *fullpath, const struct dfs_info3_param *ref, diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index a48387265cd4..5ce5686353f1 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3261,8 +3261,10 @@ cifs_cleanup_volume_info(struct smb_vol *volume_info) #ifdef CONFIG_CIFS_DFS_UPCALL -/* build_path_to_root returns full path to root when - * we do not have an exiting connection (tcon) */ +/* + * cifs_build_path_to_root returns full path to root when we do not have an + * exiting connection (tcon) + */ static char * build_unc_path_to_root(const struct smb_vol *vol, const struct cifs_sb_info *cifs_sb) @@ -3518,8 +3520,10 @@ remote_path_check: rc = -ENOSYS; goto mount_fail_check; } - /* build_path_to_root works only when we have a valid tcon */ - full_path = build_path_to_root(volume_info, cifs_sb, tcon); + /* + * cifs_build_path_to_root works only when we have a valid tcon + */ + full_path = cifs_build_path_to_root(volume_info, cifs_sb, tcon); if (full_path == NULL) { rc = -ENOMEM; goto mount_fail_check; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index d3671f2acb29..3b7e0c1266f7 100644 --- a/fs/cifs/dir.c +++ 
b/fs/cifs/dir.c @@ -44,6 +44,37 @@ renew_parental_timestamps(struct dentry *direntry) } while (!IS_ROOT(direntry)); } +char * +cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, + struct cifs_tcon *tcon) +{ + int pplen = vol->prepath ? strlen(vol->prepath) : 0; + int dfsplen; + char *full_path = NULL; + + /* if no prefix path, simply set path to the root of share to "" */ + if (pplen == 0) { + full_path = kzalloc(1, GFP_KERNEL); + return full_path; + } + + if (tcon->Flags & SMB_SHARE_IS_IN_DFS) + dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); + else + dfsplen = 0; + + full_path = kmalloc(dfsplen + pplen + 1, GFP_KERNEL); + if (full_path == NULL) + return full_path; + + if (dfsplen) + strncpy(full_path, tcon->treeName, dfsplen); + strncpy(full_path + dfsplen, vol->prepath, pplen); + convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb)); + full_path[dfsplen + pplen] = 0; /* add trailing null */ + return full_path; +} + /* Note: caller must free return buffer */ char * build_path_from_dentry(struct dentry *direntry) diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 34cea2798333..a5d234c8d5d9 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -575,37 +575,6 @@ cifs_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, return CIFSSMBQFileInfo(xid, tcon, fid->netfid, data); } -static char * -cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, - struct cifs_tcon *tcon) -{ - int pplen = vol->prepath ? strlen(vol->prepath) : 0; - int dfsplen; - char *full_path = NULL; - - /* if no prefix path, simply set path to the root of share to "" */ - if (pplen == 0) { - full_path = kzalloc(1, GFP_KERNEL); - return full_path; - } - - if (tcon->Flags & SMB_SHARE_IS_IN_DFS) - dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); - else - dfsplen = 0; - - full_path = kmalloc(dfsplen + pplen + 1, GFP_KERNEL); - if (full_path == NULL) - return full_path; - - if (dfsplen) - strncpy(full_path, tcon->treeName, dfsplen); - strncpy(full_path + dfsplen, vol->prepath, pplen); - convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb)); - full_path[dfsplen + pplen] = 0; /* add trailing null */ - return full_path; -} - static void cifs_clear_stats(struct cifs_tcon *tcon) { @@ -943,7 +912,6 @@ struct smb_version_operations smb1_operations = { .set_path_size = CIFSSMBSetEOF, .set_file_size = CIFSSMBSetFileSize, .set_file_info = smb_set_file_info, - .build_path_to_root = cifs_build_path_to_root, .echo = CIFSSMBEcho, .mkdir = CIFSSMBMkDir, .mkdir_setinfo = cifs_mkdir_setinfo, diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 4d9dbe0b7385..137aaf8d6f38 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -262,23 +262,6 @@ smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, return rc; } -static char * -smb2_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, - struct cifs_tcon *tcon) -{ - int pplen = vol->prepath ? 
strlen(vol->prepath) : 0; - char *full_path = NULL; - - /* if no prefix path, simply set path to the root of share to "" */ - if (pplen == 0) { - full_path = kzalloc(2, GFP_KERNEL); - return full_path; - } - - cERROR(1, "prefixpath is not supported for SMB2 now"); - return NULL; -} - static bool smb2_can_echo(struct TCP_Server_Info *server) { @@ -613,7 +596,6 @@ struct smb_version_operations smb21_operations = { .set_path_size = smb2_set_path_size, .set_file_size = smb2_set_file_size, .set_file_info = smb2_set_file_info, - .build_path_to_root = smb2_build_path_to_root, .mkdir = smb2_mkdir, .mkdir_setinfo = smb2_mkdir_setinfo, .rmdir = smb2_rmdir, -- cgit v1.2.1 From 9ec3c882879d3777914d34c0143c7d5b87dbb5ea Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 22 Nov 2012 17:00:10 +0400 Subject: CIFS: Separate pushing posix locks and lock_sem handling Reviewed-by: Jeff Layton Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/file.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 70b6f4c3a0c1..5fbbf99e61f9 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1041,9 +1041,8 @@ struct lock_to_push { }; static int -cifs_push_posix_locks(struct cifsFileInfo *cfile) +cifs_push_posix_locks_locked(struct cifsFileInfo *cfile) { - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct file_lock *flock, **before; unsigned int count = 0, i = 0; @@ -1054,14 +1053,6 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) xid = get_xid(); - /* we are going to update can_cache_brlcks here - need a write access */ - down_write(&cinode->lock_sem); - if (!cinode->can_cache_brlcks) { - up_write(&cinode->lock_sem); - free_xid(xid); - return rc; - } - lock_flocks(); cifs_for_each_lock(cfile->dentry->d_inode, before) { if ((*before)->fl_flags & FL_POSIX) @@ -1127,9 +1118,6 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) } out: - cinode->can_cache_brlcks = false; - up_write(&cinode->lock_sem); - free_xid(xid); return rc; err_out: @@ -1140,6 +1128,24 @@ err_out: goto out; } +static int +cifs_push_posix_locks(struct cifsFileInfo *cfile) +{ + struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + int rc = 0; + + /* we are going to update can_cache_brlcks here - need a write access */ + down_write(&cinode->lock_sem); + if (!cinode->can_cache_brlcks) { + up_write(&cinode->lock_sem); + return rc; + } + rc = cifs_push_posix_locks_locked(cfile); + cinode->can_cache_brlcks = false; + up_write(&cinode->lock_sem); + return rc; +} + static int cifs_push_locks(struct cifsFileInfo *cfile) { -- cgit v1.2.1 From b8db928b765b4b0fe1aec3eb7f1741fedbed9a33 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 22 Nov 2012 17:07:16 +0400 Subject: CIFS: Separate pushing mandatory locks and lock_sem handling Reviewed-by: Jeff Layton Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/file.c | 39 ++++++++++----------------------------- fs/cifs/smb2file.c | 12 ------------ 2 files changed, 10 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 5fbbf99e61f9..1747cbff7ddf 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -948,7 +948,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) int rc = 0, stored_rc; struct cifsLockInfo *li, *tmp; struct cifs_tcon *tcon; - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); unsigned int num, max_num, 
max_buf; LOCKING_ANDX_RANGE *buf, *cur; int types[] = {LOCKING_ANDX_LARGE_FILES, @@ -958,21 +957,12 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) xid = get_xid(); tcon = tlink_tcon(cfile->tlink); - /* we are going to update can_cache_brlcks here - need a write access */ - down_write(&cinode->lock_sem); - if (!cinode->can_cache_brlcks) { - up_write(&cinode->lock_sem); - free_xid(xid); - return rc; - } - /* * Accessing maxBuf is racy with cifs_reconnect - need to store value * and check it for zero before using. */ max_buf = tcon->ses->server->maxBuf; if (!max_buf) { - up_write(&cinode->lock_sem); free_xid(xid); return -EINVAL; } @@ -981,7 +971,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) sizeof(LOCKING_ANDX_RANGE); buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); if (!buf) { - up_write(&cinode->lock_sem); free_xid(xid); return -ENOMEM; } @@ -1018,9 +1007,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) } } - cinode->can_cache_brlcks = false; - up_write(&cinode->lock_sem); - kfree(buf); free_xid(xid); return rc; @@ -1041,7 +1027,7 @@ struct lock_to_push { }; static int -cifs_push_posix_locks_locked(struct cifsFileInfo *cfile) +cifs_push_posix_locks(struct cifsFileInfo *cfile) { struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct file_lock *flock, **before; @@ -1129,9 +1115,11 @@ err_out: } static int -cifs_push_posix_locks(struct cifsFileInfo *cfile) +cifs_push_locks(struct cifsFileInfo *cfile) { + struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); int rc = 0; /* we are going to update can_cache_brlcks here - need a write access */ @@ -1140,24 +1128,17 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) up_write(&cinode->lock_sem); return rc; } - rc = cifs_push_posix_locks_locked(cfile); - cinode->can_cache_brlcks = false; - up_write(&cinode->lock_sem); - return rc; -} - -static int -cifs_push_locks(struct cifsFileInfo *cfile) -{ - struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); - struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); if (cap_unix(tcon->ses) && (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) - return cifs_push_posix_locks(cfile); + rc = cifs_push_posix_locks(cfile); + else + rc = tcon->ses->server->ops->push_mand_locks(cfile); - return tcon->ses->server->ops->push_mand_locks(cfile); + cinode->can_cache_brlcks = false; + up_write(&cinode->lock_sem); + return rc; } static void diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index a93eec30a50d..71e6aed4b382 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -260,13 +260,6 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile) struct cifs_fid_locks *fdlocks; xid = get_xid(); - /* we are going to update can_cache_brlcks here - need a write access */ - down_write(&cinode->lock_sem); - if (!cinode->can_cache_brlcks) { - up_write(&cinode->lock_sem); - free_xid(xid); - return rc; - } /* * Accessing maxBuf is racy with cifs_reconnect - need to store value @@ -274,7 +267,6 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile) */ max_buf = tlink_tcon(cfile->tlink)->ses->server->maxBuf; if (!max_buf) { - up_write(&cinode->lock_sem); free_xid(xid); return -EINVAL; } @@ -282,7 +274,6 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile) max_num = max_buf / sizeof(struct smb2_lock_element); buf = kzalloc(max_num * sizeof(struct smb2_lock_element), 
GFP_KERNEL); if (!buf) { - up_write(&cinode->lock_sem); free_xid(xid); return -ENOMEM; } @@ -293,10 +284,7 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile) rc = stored_rc; } - cinode->can_cache_brlcks = false; kfree(buf); - - up_write(&cinode->lock_sem); free_xid(xid); return rc; } -- cgit v1.2.1 From f152fd5fffa78910c467b17f12d0aa060aa408a6 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 22 Nov 2012 17:10:57 +0400 Subject: CIFS: Implement cifs_relock_file that reacquires byte-range locks when a file is reopened. Reviewed-by: Jeff Layton Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/file.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 1747cbff7ddf..67fe0b811f23 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -505,16 +505,36 @@ out: return rc; } +static int cifs_push_posix_locks(struct cifsFileInfo *cfile); + /* * Try to reacquire byte range locks that were released when session - * to server was lost + * to server was lost. */ -static int cifs_relock_file(struct cifsFileInfo *cifsFile) +static int +cifs_relock_file(struct cifsFileInfo *cfile) { + struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); + struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); int rc = 0; - /* BB list all locks open on this file and relock */ + /* we are going to update can_cache_brlcks here - need a write access */ + down_write(&cinode->lock_sem); + if (cinode->can_cache_brlcks) { + /* can cache locks - no need to push them */ + up_write(&cinode->lock_sem); + return rc; + } + + if (cap_unix(tcon->ses) && + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && + ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) + rc = cifs_push_posix_locks(cfile); + else + rc = tcon->ses->server->ops->push_mand_locks(cfile); + up_write(&cinode->lock_sem); return rc; } -- cgit v1.2.1 From 21cb2d90c76cbc951da3a266f0dd439d64f3114a Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 22 Nov 2012 18:56:39 +0400 Subject: CIFS: Fix lock consistency bug in cifs_setlk If we negotiate the mandatory locking style, have a read lock, and try to set a write lock, we end up with a write lock in the VFS cache and no lock in the cifs lock cache - that's wrong. Fix it by returning from cifs_setlk immediately if an error occurs while setting a lock. Reviewed-by: Jeff Layton Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/file.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 67fe0b811f23..bceffa8c034e 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1443,16 +1443,18 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, return -ENOMEM; rc = cifs_lock_add_if(cfile, lock, wait_flag); - if (rc < 0) + if (rc < 0) { kfree(lock); - if (rc <= 0) + return rc; + } + if (!rc) goto out; rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, type, 1, 0, wait_flag); if (rc) { kfree(lock); - goto out; + return rc; } cifs_lock_add(cfile, lock); -- cgit v1.2.1 From dd446b16edd74ca525208d924d426f786dd973f8 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 28 Nov 2012 23:21:06 -0600 Subject: Add SMB2.02 dialect support This patch enables optional use of the original SMB2 (SMB2.02) dialect by specifying vers=2.0 on mount.
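Usage note: with this applied, the original SMB2 dialect can be requested explicitly at mount time, for example:

    mount -t cifs //server/share /mnt -o vers=2.0,username=user

Per the hunk in cifs_parse_smb_version below, vers=2.0 selects the new smb20_values table while reusing smb21_operations ("currently identical with 2.1"); the default dialect is unchanged.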
Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 1 + fs/cifs/connect.c | 5 +++++ fs/cifs/smb2ops.c | 17 +++++++++++++++++ 3 files changed, 23 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index d1a93d32db81..ac66409fb9d3 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -178,6 +178,7 @@ struct smb_rqst { enum smb_version { Smb_1 = 1, + Smb_20, Smb_21, Smb_30, }; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 5ce5686353f1..d01c7328dbae 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -274,6 +274,7 @@ static const match_table_t cifs_cacheflavor_tokens = { static const match_table_t cifs_smb_version_tokens = { { Smb_1, SMB1_VERSION_STRING }, + { Smb_20, SMB20_VERSION_STRING}, { Smb_21, SMB21_VERSION_STRING }, { Smb_30, SMB30_VERSION_STRING }, }; @@ -1074,6 +1075,10 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol) vol->vals = &smb1_values; break; #ifdef CONFIG_CIFS_SMB2 + case Smb_20: + vol->ops = &smb21_operations; /* currently identical with 2.1 */ + vol->vals = &smb20_values; + break; case Smb_21: vol->ops = &smb21_operations; vol->vals = &smb21_values; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 137aaf8d6f38..ad4d96a4bff5 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -625,6 +625,23 @@ struct smb_version_operations smb21_operations = { .new_lease_key = smb2_new_lease_key, }; +struct smb_version_values smb20_values = { + .version_string = SMB20_VERSION_STRING, + .protocol_id = SMB20_PROT_ID, + .req_capabilities = 0, /* MBZ */ + .large_lock_type = 0, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, + .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, + .header_size = sizeof(struct smb2_hdr), + .max_header_size = MAX_SMB2_HDR_SIZE, + .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .lock_cmd = SMB2_LOCK, + .cap_unix = 0, + .cap_nt_find = SMB2_NT_FIND, + .cap_large_files = SMB2_LARGE_FILES, +}; + struct smb_version_values smb21_values = { .version_string = SMB21_VERSION_STRING, .protocol_id = SMB21_PROT_ID, -- cgit v1.2.1 From 3c15b4cf5580658951115f85efb2dea6a1380999 Mon Sep 17 00:00:00 2001 From: Jesper Nilsson Date: Thu, 29 Nov 2012 17:31:16 +0100 Subject: cifs: Add handling of blank password option The option to have a blank "pass=" already exists, and with a password specified both "pass=%s" and "password=%s" are supported. Also, both blank "user=" and "username=" are supported, making "password=" the odd man out. Signed-off-by: Jesper Nilsson Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index d01c7328dbae..dec7c15d886a 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -186,6 +186,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_user, "user=%s" }, { Opt_user, "username=%s" }, { Opt_blank_pass, "pass=" }, + { Opt_blank_pass, "password=" }, { Opt_pass, "pass=%s" }, { Opt_pass, "password=%s" }, { Opt_blank_ip, "ip=" }, -- cgit v1.2.1 From ccb5c001b3035ca470fe21424e439530ba838510 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 25 Nov 2012 08:00:40 -0500 Subject: cifs: ensure we revalidate the inode after readdir if cifsacl is enabled Otherwise, "ls -l" will simply show the ownership of the files as the default mnt_uid/gid. This may make "ls -l" performance on large directories super-suck in some cases, but that's the cost of cifsacl. 
One possibility to make it suck less would be to somehow proactively dispatch the ACL requests asynchronously from readdir codepath, but that's non-trivial to implement. Reviewed-by: Shirish Pargaonkar Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/readdir.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 1c576e871366..64920920d908 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -137,6 +137,16 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) if (fattr->cf_cifsattrs & ATTR_READONLY) fattr->cf_mode &= ~S_IWUGO; + /* + * We of course don't get ACL info in FIND_FIRST/NEXT results, so + * mark it for revalidation so that "ls -l" will look right. It might + * be super-slow, but if we don't do this then the ownership of files + * may look wrong since the inodes may not have timed out by the time + * "ls" does a stat() call on them. + */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) + fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL && fattr->cf_cifsattrs & ATTR_SYSTEM) { if (fattr->cf_eof == 0) { -- cgit v1.2.1 From b979aaa1777259330435c47f900833dabe9189e8 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 26 Nov 2012 11:09:55 -0500 Subject: cifs: get rid of smb_vol->UNCip and smb_vol->port Passing this around as a string is contorted and painful. Instead, just convert these to a sockaddr as soon as possible, since that's how we're going to work with it later anyway. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 3 +- fs/cifs/cifsproto.h | 4 +-- fs/cifs/connect.c | 91 +++++++++++++++++++---------------------------------- fs/cifs/netmisc.c | 14 +-------- 4 files changed, 36 insertions(+), 76 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index ac66409fb9d3..052d85b333f3 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -394,7 +394,6 @@ struct smb_vol { char *password; char *domainname; char *UNC; - char *UNCip; char *iocharset; /* local code page for mapping to and from Unicode */ char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */ char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */ @@ -442,11 +441,11 @@ struct smb_vol { unsigned int rsize; unsigned int wsize; bool sockopt_tcp_nodelay:1; - unsigned short int port; unsigned long actimeo; /* attribute cache timeout (jiffies) */ struct smb_version_operations *ops; struct smb_version_values *vals; char *prepath; + struct sockaddr_storage dstaddr; /* destination address */ struct sockaddr_storage srcaddr; /* allow binding to a local IP */ struct nls_table *local_nls; }; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 7494358ba533..15a8cb66a07b 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -110,9 +110,7 @@ extern unsigned int smbCalcSize(void *buf); extern int decode_negTokenInit(unsigned char *security_blob, int length, struct TCP_Server_Info *server); extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len); -extern int cifs_set_port(struct sockaddr *addr, const unsigned short int port); -extern int cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len, - const unsigned short int port); +extern void cifs_set_port(struct sockaddr *addr, const unsigned short int port); extern int map_smb_to_linux_error(char *buf, bool logErr); extern void header_assemble(struct smb_hdr *, char /* 
command */ , const struct cifs_tcon *, int /* length of diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index dec7c15d886a..428d8a12b827 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1114,6 +1114,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, char *string = NULL; char *tmp_end, *value; char delim; + bool got_ip = false; + unsigned short port = 0; + struct sockaddr *dstaddr = (struct sockaddr *)&vol->dstaddr; separator[0] = ','; separator[1] = 0; @@ -1422,12 +1425,12 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->dir_mode = option; break; case Opt_port: - if (get_option_ul(args, &option)) { - cERROR(1, "%s: Invalid port value", - __func__); + if (get_option_ul(args, &option) || + option > USHRT_MAX) { + cERROR(1, "%s: Invalid port value", __func__); goto cifs_parse_mount_err; } - vol->port = option; + port = (unsigned short)option; break; case Opt_rsize: if (get_option_ul(args, &option)) { @@ -1543,25 +1546,21 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->password[j] = '\0'; break; case Opt_blank_ip: - vol->UNCip = NULL; + /* FIXME: should this be an error instead? */ + got_ip = false; break; case Opt_ip: string = match_strdup(args); if (string == NULL) goto out_nomem; - if (strnlen(string, INET6_ADDRSTRLEN) > - INET6_ADDRSTRLEN) { - printk(KERN_WARNING "CIFS: ip address " - "too long\n"); - goto cifs_parse_mount_err; - } - vol->UNCip = kstrdup(string, GFP_KERNEL); - if (!vol->UNCip) { - printk(KERN_WARNING "CIFS: no memory " - "for UNC IP\n"); + if (!cifs_convert_address(dstaddr, string, + strlen(string))) { + printk(KERN_ERR "CIFS: bad ip= option (%s).\n", + string); goto cifs_parse_mount_err; } + got_ip = true; break; case Opt_unc: string = match_strdup(args); @@ -1811,8 +1810,18 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, goto cifs_parse_mount_err; } - if (vol->UNCip == NULL) - vol->UNCip = &vol->UNC[2]; + if (!got_ip) { + /* No ip= option specified? 
Try to get it from UNC */ + if (!cifs_convert_address(dstaddr, &vol->UNC[2], + strlen(&vol->UNC[2]))) { + printk(KERN_ERR "Unable to determine destination " + "address.\n"); + goto cifs_parse_mount_err; + } + } + + /* set the port that we got earlier */ + cifs_set_port(dstaddr, port); if (uid_specified) vol->override_uid = override_uid; @@ -2062,29 +2071,13 @@ static struct TCP_Server_Info * cifs_get_tcp_session(struct smb_vol *volume_info) { struct TCP_Server_Info *tcp_ses = NULL; - struct sockaddr_storage addr; - struct sockaddr_in *sin_server = (struct sockaddr_in *) &addr; - struct sockaddr_in6 *sin_server6 = (struct sockaddr_in6 *) &addr; + struct sockaddr *dstaddr = (struct sockaddr *)&volume_info->dstaddr; int rc; - memset(&addr, 0, sizeof(struct sockaddr_storage)); - - cFYI(1, "UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip); - - if (volume_info->UNCip && volume_info->UNC) { - rc = cifs_fill_sockaddr((struct sockaddr *)&addr, - volume_info->UNCip, - strlen(volume_info->UNCip), - volume_info->port); - if (!rc) { - /* we failed translating address */ - rc = -EINVAL; - goto out_err; - } - } + cFYI(1, "UNC: %s", volume_info->UNC); /* see if we already have a matching tcp_ses */ - tcp_ses = cifs_find_tcp_session((struct sockaddr *)&addr, volume_info); + tcp_ses = cifs_find_tcp_session(dstaddr, volume_info); if (tcp_ses) return tcp_ses; @@ -2140,15 +2133,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) sizeof(tcp_ses->srcaddr)); ++tcp_ses->srv_count; - if (addr.ss_family == AF_INET6) { - cFYI(1, "attempting ipv6 connect"); - /* BB should we allow ipv6 on port 139? */ - /* other OS never observed in Wild doing 139 with v6 */ - memcpy(&tcp_ses->dstaddr, sin_server6, - sizeof(struct sockaddr_in6)); - } else - memcpy(&tcp_ses->dstaddr, sin_server, - sizeof(struct sockaddr_in)); + memcpy(&tcp_ses->dstaddr, dstaddr, sizeof(tcp_ses->dstaddr)); rc = ip_connect(tcp_ses); if (rc < 0) { @@ -2708,11 +2693,9 @@ cifs_match_super(struct super_block *sb, void *data) struct cifs_ses *ses; struct cifs_tcon *tcon; struct tcon_link *tlink; - struct sockaddr_storage addr; + struct sockaddr *dstaddr; int rc = 0; - memset(&addr, 0, sizeof(struct sockaddr_storage)); - spin_lock(&cifs_tcp_ses_lock); cifs_sb = CIFS_SB(sb); tlink = cifs_get_tlink(cifs_sb_master_tlink(cifs_sb)); @@ -2725,15 +2708,9 @@ cifs_match_super(struct super_block *sb, void *data) tcp_srv = ses->server; volume_info = mnt_data->vol; + dstaddr = (struct sockaddr *)&volume_info->dstaddr; - rc = cifs_fill_sockaddr((struct sockaddr *)&addr, - volume_info->UNCip, - strlen(volume_info->UNCip), - volume_info->port); - if (!rc) - goto out; - - if (!match_server(tcp_srv, (struct sockaddr *)&addr, volume_info) || + if (!match_server(tcp_srv, dstaddr, volume_info) || !match_session(ses, volume_info) || !match_tcon(tcon, volume_info->UNC)) { rc = 0; @@ -3248,8 +3225,6 @@ cleanup_volume_info_contents(struct smb_vol *volume_info) { kfree(volume_info->username); kzfree(volume_info->password); - if (volume_info->UNCip != volume_info->UNC + 2) - kfree(volume_info->UNCip); kfree(volume_info->UNC); kfree(volume_info->domainname); kfree(volume_info->iocharset); diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index d5ce9e26696c..a82bc51fdc82 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -204,7 +204,7 @@ cifs_convert_address(struct sockaddr *dst, const char *src, int len) return rc; } -int +void cifs_set_port(struct sockaddr *addr, const unsigned short int port) { switch (addr->sa_family) { @@ -214,19 +214,7 @@ cifs_set_port(struct 
sockaddr *addr, const unsigned short int port) case AF_INET6: ((struct sockaddr_in6 *)addr)->sin6_port = htons(port); break; - default: - return 0; } - return 1; -} - -int -cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len, - const unsigned short int port) -{ - if (!cifs_convert_address(dst, src, len)) - return 0; - return cifs_set_port(dst, port); } /***************************************************************************** -- cgit v1.2.1 From 1cc9bd68617f2a92dcd6e4398288341d16cfb5c1 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 29 Nov 2012 18:07:51 -0600 Subject: make convert_delimiter use strchr instead of open-coding it Take advantage of accelerated strchr() on arches that support it. Also, no caller ever passes in a NULL pointer. Get rid of the unneeded NULL pointer check. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 052d85b333f3..74a07b604ffd 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1064,21 +1064,16 @@ static inline char CIFS_DIR_SEP(const struct cifs_sb_info *cifs_sb) static inline void convert_delimiter(char *path, char delim) { - int i; - char old_delim; - - if (path == NULL) - return; + char old_delim, *pos; if (delim == '/') old_delim = '\\'; else old_delim = '/'; - for (i = 0; path[i] != '\0'; i++) { - if (path[i] == old_delim) - path[i] = delim; - } + pos = path; + while ((pos = strchr(pos, old_delim))) + *pos = delim; } #ifdef CONFIG_CIFS_STATS -- cgit v1.2.1 From 9fa114f74feb140ac93e5983428c8f9312ffd6c2 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 26 Nov 2012 11:09:57 -0500 Subject: cifs: remove unneeded address argument from cifs_find_tcp_session and match_server Now that the smb_vol contains the destination sockaddr, there's no need to pass it in separately. 
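Returning for a moment to the convert_delimiter() rewrite two patches up: the strchr()-based scan is a reusable idiom - let the libc/arch-optimized search find each delimiter instead of testing every byte by hand. A standalone sketch of the same loop:

#include <stdio.h>
#include <string.h>

static void convert_delimiter(char *path, char old_delim, char new_delim)
{
	char *pos = path;

	while ((pos = strchr(pos, old_delim)))
		*pos = new_delim;	/* overwrite, then keep scanning from here */
}

int main(void)
{
	char path[] = "dir/sub/file.txt";

	convert_delimiter(path, '/', '\\');
	printf("%s\n", path);	/* dir\sub\file.txt */
	return 0;
}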
Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 428d8a12b827..87fa16549f27 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1992,9 +1992,10 @@ match_security(struct TCP_Server_Info *server, struct smb_vol *vol) return true; } -static int match_server(struct TCP_Server_Info *server, struct sockaddr *addr, - struct smb_vol *vol) +static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol) { + struct sockaddr *addr = (struct sockaddr *)&vol->dstaddr; + if ((server->vals != vol->vals) || (server->ops != vol->ops)) return 0; @@ -2015,13 +2016,13 @@ static int match_server(struct TCP_Server_Info *server, struct sockaddr *addr, } static struct TCP_Server_Info * -cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol) +cifs_find_tcp_session(struct smb_vol *vol) { struct TCP_Server_Info *server; spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { - if (!match_server(server, addr, vol)) + if (!match_server(server, vol)) continue; ++server->srv_count; @@ -2071,13 +2072,12 @@ static struct TCP_Server_Info * cifs_get_tcp_session(struct smb_vol *volume_info) { struct TCP_Server_Info *tcp_ses = NULL; - struct sockaddr *dstaddr = (struct sockaddr *)&volume_info->dstaddr; int rc; cFYI(1, "UNC: %s", volume_info->UNC); /* see if we already have a matching tcp_ses */ - tcp_ses = cifs_find_tcp_session(dstaddr, volume_info); + tcp_ses = cifs_find_tcp_session(volume_info); if (tcp_ses) return tcp_ses; @@ -2122,19 +2122,18 @@ cifs_get_tcp_session(struct smb_vol *volume_info) INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); INIT_LIST_HEAD(&tcp_ses->smb_ses_list); INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request); - + memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr, + sizeof(tcp_ses->srcaddr)); + memcpy(&tcp_ses->dstaddr, &volume_info->dstaddr, + sizeof(tcp_ses->dstaddr)); /* * at this point we are the only ones with the pointer * to the struct since the kernel thread not created yet * no need to spinlock this init of tcpStatus or srv_count */ tcp_ses->tcpStatus = CifsNew; - memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr, - sizeof(tcp_ses->srcaddr)); ++tcp_ses->srv_count; - memcpy(&tcp_ses->dstaddr, dstaddr, sizeof(tcp_ses->dstaddr)); - rc = ip_connect(tcp_ses); if (rc < 0) { cERROR(1, "Error connecting to socket. Aborting operation"); @@ -2693,7 +2692,6 @@ cifs_match_super(struct super_block *sb, void *data) struct cifs_ses *ses; struct cifs_tcon *tcon; struct tcon_link *tlink; - struct sockaddr *dstaddr; int rc = 0; spin_lock(&cifs_tcp_ses_lock); @@ -2708,9 +2706,8 @@ cifs_match_super(struct super_block *sb, void *data) tcp_srv = ses->server; volume_info = mnt_data->vol; - dstaddr = (struct sockaddr *)&volume_info->dstaddr; - if (!match_server(tcp_srv, dstaddr, volume_info) || + if (!match_server(tcp_srv, volume_info) || !match_session(ses, volume_info) || !match_tcon(tcon, volume_info->UNC)) { rc = 0; -- cgit v1.2.1 From 6ee9542a8701a906dbe5141bf1e1ad395d957222 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 26 Nov 2012 11:09:57 -0500 Subject: cifs: always zero out smb_vol before parsing options Currently, the code relies on the callers to do that and they all do, but this will ensure that it's always done. 
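The pattern deserves spelling out: the parser owns initialization. Once cifs_parse_mount_options() does the memset itself, callers may allocate with plain kmalloc (as this patch switches cifs_get_volume_info() to do) and every field not named in the option string is still guaranteed to be zero. A minimal userspace sketch of the idiom (names are illustrative):

#include <stdbool.h>
#include <string.h>

struct opts {
	unsigned int port;
	bool nullauth;
	/* ... many more fields ... */
};

static int parse_opts(struct opts *o, const char *optstring)
{
	memset(o, 0, sizeof(*o));	/* don't depend on callers to zero */
	/* ... set only the fields that optstring mentions ... */
	(void)optstring;
	return 0;
}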
Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 87fa16549f27..290c13442f75 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1122,6 +1122,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, separator[1] = 0; delim = separator[0]; + /* ensure we always start with zeroed-out smb_vol */ + memset(vol, 0, sizeof(*vol)); + /* * does not have to be perfect mapping since field is * informational, only used for servers that do not support @@ -3314,7 +3317,6 @@ expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses, mdata = NULL; } else { cleanup_volume_info_contents(volume_info); - memset(volume_info, '\0', sizeof(*volume_info)); rc = cifs_setup_volume_info(volume_info, mdata, fake_devname); } @@ -3336,7 +3338,6 @@ cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data, if (cifs_parse_mount_options(mount_data, devname, volume_info)) return -EINVAL; - if (volume_info->nullauth) { cFYI(1, "Anonymous login"); kfree(volume_info->username); @@ -3373,7 +3374,7 @@ cifs_get_volume_info(char *mount_data, const char *devname) int rc; struct smb_vol *volume_info; - volume_info = kzalloc(sizeof(struct smb_vol), GFP_KERNEL); + volume_info = kmalloc(sizeof(struct smb_vol), GFP_KERNEL); if (!volume_info) return ERR_PTR(-ENOMEM); -- cgit v1.2.1 From 176c9b3939d22bb1177eb15010e600bc59a1b0b5 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 29 Nov 2012 11:37:18 -0800 Subject: cifs: Remove unused cEVENT macro It uses an undefined KERN_EVENT and is itself unused. Signed-off-by: Joe Perches Reviewed-by: Jeff Layton Signed-off-by: Steve French Signed-off-by: Steve French --- fs/cifs/cifs_debug.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index c0c68bb492d7..b0fc344eb857 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -63,12 +63,6 @@ do { \ /* debug event message: */ extern int cifsERROR; -#define cEVENT(fmt, arg...) \ -do { \ - if (cifsERROR) \ - printk(KERN_EVENT "%s: " fmt "\n", __FILE__, ##arg); \ -} while (0) - /* error event message: e.g., i/o error */ #define cifserror(fmt, arg...) \ do { \ @@ -88,7 +82,6 @@ do { \ */ #else /* _CIFS_DEBUG */ #define cERROR(set, fmt, arg...) -#define cEVENT(fmt, arg...) #define cFYI(set, fmt, arg...) #define cifserror(fmt, arg...) #endif /* _CIFS_DEBUG */ -- cgit v1.2.1 From 52c0f4ad8ed462d81f1d37f56a74a71dc0c9bf0f Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 4 Dec 2012 16:56:37 -0600 Subject: SMB3 mounts fail with access denied to some servers We were checking incorrectly if signatures were required to be sent, so were always sending signatures after the initial session establishment. For SMB3 mounts (vers=3.0) this was a problem because we were putting SMB2 signatures in SMB3 requests which would cause access denied on mount (the tree connection would fail). This might also be worth considering for stable (for 3.7), as the error message on mount (access denied) is confusing to users and there is no workaround if the server is configured to only support smb3.0. I am ok either way. 
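The root cause is that the CIFSSEC_MUST_* values are multi-bit masks that embed their CIFSSEC_MAY_* counterpart (compare CIFSSEC_MUST_SEAL 0x40040 and CIFSSEC_MUST_NTLMSSP 0x80080 earlier in this series), so a bare '&' test fires whenever signing is merely allowed. A small demonstration - the MAY/MUST values below are assumed from the cifsglob.h of this era:

#include <stdio.h>

#define CIFSSEC_MAY_SIGN  0x00001
#define CIFSSEC_MUST_SIGN 0x01001	/* embeds the MAY_SIGN bit */

int main(void)
{
	unsigned int sec_flags = CIFSSEC_MAY_SIGN;	/* allowed, not required */

	/* old test: true because the MAY bit overlaps the MUST mask */
	printf("old: %d\n", (sec_flags & CIFSSEC_MUST_SIGN) != 0);
	/* fixed test: true only when every MUST bit is set */
	printf("new: %d\n", (sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN);
	return 0;
}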
CC: stable Signed-off-by: Steve French Reviewed-by: Jeff Layton --- fs/cifs/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index cf33622cdac8..e7f9dbc33ce2 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -425,7 +425,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) } cFYI(1, "sec_flags 0x%x", sec_flags); - if (sec_flags & CIFSSEC_MUST_SIGN) { + if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) { cFYI(1, "Signing required"); if (!(server->sec_mode & (SMB2_NEGOTIATE_SIGNING_REQUIRED | SMB2_NEGOTIATE_SIGNING_ENABLED))) { -- cgit v1.2.1 From bde98197310fd085ee4bb00ab310abcbe55b0664 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 5 Dec 2012 12:42:47 -0800 Subject: cifs: Make CIFS_DEBUG possible to undefine Make the compilation work again when CIFS_DEBUG is not #define'd. Add format and argument verification for the various macros when CIFS_DEBUG is not #define'd. Signed-off-by: Joe Perches Reviewed-by: Jeff Layton --- fs/cifs/cifs_debug.h | 64 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index b0fc344eb857..4d12fe48fb50 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -37,6 +37,9 @@ void dump_smb(void *, int); #define CIFS_RC 0x02 #define CIFS_TIMER 0x04 +extern int cifsFYI; +extern int cifsERROR; + /* * debug ON * -------- @@ -44,36 +47,33 @@ void dump_smb(void *, int); #ifdef CIFS_DEBUG /* information message: e.g., configuration, major event */ -extern int cifsFYI; -#define cifsfyi(fmt, arg...) \ +#define cifsfyi(fmt, ...) \ do { \ if (cifsFYI & CIFS_INFO) \ - printk(KERN_DEBUG "%s: " fmt "\n", __FILE__, ##arg); \ + printk(KERN_DEBUG "%s: " fmt "\n", \ + __FILE__, ##__VA_ARGS__); \ } while (0) -#define cFYI(set, fmt, arg...) \ -do { \ - if (set) \ - cifsfyi(fmt, ##arg); \ +#define cFYI(set, fmt, ...) \ +do { \ + if (set) \ + cifsfyi(fmt, ##__VA_ARGS__); \ } while (0) -#define cifswarn(fmt, arg...) \ - printk(KERN_WARNING fmt "\n", ##arg) - -/* debug event message: */ -extern int cifsERROR; +#define cifswarn(fmt, ...) \ + printk(KERN_WARNING fmt "\n", ##__VA_ARGS__) /* error event message: e.g., i/o error */ -#define cifserror(fmt, arg...) \ -do { \ - if (cifsERROR) \ - printk(KERN_ERR "CIFS VFS: " fmt "\n", ##arg); \ +#define cifserror(fmt, ...) \ +do { \ + if (cifsERROR) \ + printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__); \ } while (0) -#define cERROR(set, fmt, arg...) \ -do { \ - if (set) \ - cifserror(fmt, ##arg); \ +#define cERROR(set, fmt, ...) \ +do { \ + if (set) \ + cifserror(fmt, ##__VA_ARGS__); \ } while (0) /* @@ -81,9 +81,27 @@ do { \ * --------- */ #else /* _CIFS_DEBUG */ -#define cERROR(set, fmt, arg...) -#define cFYI(set, fmt, arg...) -#define cifserror(fmt, arg...) +#define cifsfyi(fmt, ...) \ +do { \ + if (0) \ + printk(KERN_DEBUG "%s: " fmt "\n", \ + __FILE__, ##__VA_ARGS__); \ +} while (0) +#define cFYI(set, fmt, ...) \ +do { \ + if (0 && set) \ + cifsfyi(fmt, ##__VA_ARGS__); \ +} while (0) +#define cifserror(fmt, ...) \ +do { \ + if (0) \ + printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__); \ +} while (0) +#define cERROR(set, fmt, ...) 
\ +do { \ + if (0 && set) \ + cifserror(fmt, ##__VA_ARGS__); \ +} while (0) #endif /* _CIFS_DEBUG */ #endif /* _H_CIFS_DEBUG */ -- cgit v1.2.1 From 471b1f98719a8e8f34f3a696d488e50754f8cf73 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 5 Dec 2012 12:42:58 -0800 Subject: cifs: Add CONFIG_CIFS_DEBUG and rename use of CIFS_DEBUG This can reduce the size of the module by ~120KB which could be useful for embedded systems. $ size fs/cifs/built-in.o* text data bss dec hex filename 388567 34459 100440 523466 7fcca fs/cifs/built-in.o.new 495970 34599 117904 648473 9e519 fs/cifs/built-in.o.old Signed-off-by: Joe Perches Reviewed-by: Jeff Layton --- fs/cifs/Kconfig | 10 +++++++++- fs/cifs/cifs_debug.h | 3 +-- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 2075ddfffa73..21ff76c22a17 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -122,9 +122,17 @@ config CIFS_ACL Allows fetching CIFS/NTFS ACL from the server. The DACL blob is handed over to the application/caller. +config CIFS_DEBUG + bool "Enable CIFS debugging routines" + default y + depends on CIFS + help + Enabling this option adds helpful debugging messages to + the cifs code which increases the size of the cifs module. + If unsure, say Y. config CIFS_DEBUG2 bool "Enable additional CIFS debugging routines" - depends on CIFS + depends on CIFS_DEBUG help Enabling this option adds a few more debugging routines to the cifs code which slightly increases the size of diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index 4d12fe48fb50..86e92ef2abc1 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -18,7 +18,6 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */ -#define CIFS_DEBUG /* BB temporary */ #ifndef _H_CIFS_DEBUG #define _H_CIFS_DEBUG @@ -44,7 +43,7 @@ extern int cifsERROR; * debug ON * -------- */ -#ifdef CIFS_DEBUG +#ifdef CONFIG_CIFS_DEBUG /* information message: e.g., configuration, major event */ #define cifsfyi(fmt, ...) \ -- cgit v1.2.1 From eb1b3fa5cdb9c27bdec8f262acf757a06588eb2d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 3 Dec 2012 06:05:37 -0500 Subject: cifs: rename cifs_readdir_lookup to cifs_prime_dcache and make it void return The caller doesn't do anything with the dentry, so there's no point in holding a reference to it on return. Also cifs_prime_dcache better describes the actual purpose of the function. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/readdir.c | 42 ++++++++++++++++++------------------------ 1 file changed, 18 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 64920920d908..6002fdc920ae 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -66,18 +66,20 @@ static inline void dump_cifs_file_struct(struct file *file, char *label) #endif /* DEBUG2 */ /* + * Attempt to preload the dcache with the results from the FIND_FIRST/NEXT + * * Find the dentry that matches "name". If there isn't one, create one. If it's * a negative dentry or the uniqueid changed, then drop it and recreate it. 
*/ -static struct dentry * -cifs_readdir_lookup(struct dentry *parent, struct qstr *name, +static void +cifs_prime_dcache(struct dentry *parent, struct qstr *name, struct cifs_fattr *fattr) { struct dentry *dentry, *alias; struct inode *inode; struct super_block *sb = parent->d_inode->i_sb; - cFYI(1, "For %s", name->name); + cFYI(1, "%s: for %s", __func__, name->name); if (parent->d_op && parent->d_op->d_hash) parent->d_op->d_hash(parent, parent->d_inode, name); @@ -87,37 +89,32 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name, dentry = d_lookup(parent, name); if (dentry) { int err; + inode = dentry->d_inode; /* update inode in place if i_ino didn't change */ if (inode && CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) { cifs_fattr_to_inode(inode, fattr); - return dentry; + goto out; } err = d_invalidate(dentry); dput(dentry); if (err) - return NULL; + return; } dentry = d_alloc(parent, name); - if (dentry == NULL) - return NULL; + if (!dentry) + return; inode = cifs_iget(sb, fattr); - if (!inode) { - dput(dentry); - return NULL; - } + if (!inode) + goto out; alias = d_materialise_unique(dentry, inode); - if (alias != NULL) { - dput(dentry); - if (IS_ERR(alias)) - return NULL; - dentry = alias; - } - - return dentry; + if (alias && !IS_ERR(alias)) + dput(alias); +out: + dput(dentry); } static void @@ -662,7 +659,6 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir, struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifs_dirent de = { NULL, }; struct cifs_fattr fattr; - struct dentry *dentry; struct qstr name; int rc = 0; ino_t ino; @@ -733,13 +729,11 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir, */ fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; - ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); - dentry = cifs_readdir_lookup(file->f_dentry, &name, &fattr); + cifs_prime_dcache(file->f_dentry, &name, &fattr); + ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); rc = filldir(dirent, name.name, name.len, file->f_pos, ino, fattr.cf_dtype); - - dput(dentry); return rc; } -- cgit v1.2.1 From 464ee9f966404786ba4c6be35dc8362ee8e6ba4e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 20 Nov 2012 12:49:27 -0500 Subject: NFSv4.1: Ensure that the client tracks the server target_highest_slotid Dynamic slot allocation in NFSv4.1 depends on the client being able to track the server's target value for the highest slotid in the slot table. See the reference in Section 2.10.6.1 of RFC5661. To avoid ordering problems in the case where 2 SEQUENCE replies contain conflicting updates to this target value, we also introduce a generation counter, to track whether or not an RPC containing a SEQUENCE operation was launched before or after the last update. Also rename the nfs4_slot_table target_max_slots field to 'target_highest_slotid' to avoid confusion with a slot table size or number of slots. 
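The generation counter is what makes the scheme safe against reordering: a SEQUENCE reply may only move the target if the table has not changed since the RPC carrying it was launched. A compact sketch of that rule (the field names mirror the patch; everything else is simplified):

    struct slot_table {
        unsigned int target_highest_slotid;
        unsigned long generation;
    };

    struct seq_reply {
        unsigned int sr_target_highest_slotid;
        unsigned long generation;   /* sampled when the RPC was sent */
    };

    static void update_target(struct slot_table *tbl, const struct seq_reply *res)
    {
        if (res->generation != tbl->generation)
            return;   /* stale reply: sent before the last target change */
        if (tbl->target_highest_slotid == res->sr_target_highest_slotid)
            return;
        tbl->target_highest_slotid = res->sr_target_highest_slotid;
        tbl->generation++;   /* invalidate replies still in flight */
    }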
Signed-off-by: Trond Myklebust --- fs/nfs/callback_proc.c | 2 +- fs/nfs/nfs4proc.c | 25 +++++++++++++++++++++++++ fs/nfs/nfs4state.c | 7 +++---- fs/nfs/nfs4xdr.c | 4 ++-- 4 files changed, 31 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 0be08b964f38..0ef047b7d28d 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -576,7 +576,7 @@ __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy, if (args->crsa_target_max_slots == fc_tbl->max_slots) goto out; - fc_tbl->target_max_slots = args->crsa_target_max_slots; + fc_tbl->target_highest_slotid = args->crsa_target_max_slots; nfs41_handle_recall_slot(cps->clp); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 197ef3e4e1f7..d91abaa522e8 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -488,6 +488,28 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) res->sr_slot = NULL; } +/* Update the client's idea of target_highest_slotid */ +static void nfs41_set_target_slotid_locked(struct nfs4_slot_table *tbl, + u32 target_highest_slotid) +{ + if (tbl->target_highest_slotid == target_highest_slotid) + return; + tbl->target_highest_slotid = target_highest_slotid; + tbl->generation++; +} + +static void nfs41_update_target_slotid(struct nfs4_slot_table *tbl, + struct nfs4_slot *slot, + struct nfs4_sequence_res *res) +{ + spin_lock(&tbl->slot_tbl_lock); + if (tbl->generation != slot->generation) + goto out; + nfs41_set_target_slotid_locked(tbl, res->sr_target_highest_slotid); +out: + spin_unlock(&tbl->slot_tbl_lock); +} + static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) { struct nfs4_session *session; @@ -522,6 +544,7 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res * /* Check sequence flags */ if (res->sr_status_flags != 0) nfs4_schedule_lease_recovery(clp); + nfs41_update_target_slotid(slot->table, slot, res); break; case -NFS4ERR_DELAY: /* The server detected a resend of the RPC call and @@ -583,6 +606,7 @@ static struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl) tbl->highest_used_slotid = slotid; ret = &tbl->slots[slotid]; ret->renewal_time = jiffies; + ret->generation = tbl->generation; out: dprintk("<-- %s used_slots=%04lx highest_used=%d slotid=%d \n", @@ -5693,6 +5717,7 @@ static void nfs4_add_and_init_slots(struct nfs4_slot_table *tbl, tbl->max_slots = max_slots; } tbl->highest_used_slotid = NFS4_NO_SLOT; + tbl->target_highest_slotid = max_slots - 1; for (i = 0; i < tbl->max_slots; i++) tbl->slots[i].seq_nr = ivalue; spin_unlock(&tbl->slot_tbl_lock); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 9495789c425b..842cb8c2f65d 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2033,17 +2033,16 @@ static int nfs4_recall_slot(struct nfs_client *clp) return 0; nfs4_begin_drain_session(clp); fc_tbl = &clp->cl_session->fc_slot_table; - new = nfs4_alloc_slots(fc_tbl, fc_tbl->target_max_slots, GFP_NOFS); + new = nfs4_alloc_slots(fc_tbl, fc_tbl->target_highest_slotid + 1, GFP_NOFS); if (!new) return -ENOMEM; spin_lock(&fc_tbl->slot_tbl_lock); - for (i = 0; i < fc_tbl->target_max_slots; i++) + for (i = 0; i <= fc_tbl->target_highest_slotid; i++) new[i].seq_nr = fc_tbl->slots[i].seq_nr; old = fc_tbl->slots; fc_tbl->slots = new; - fc_tbl->max_slots = fc_tbl->target_max_slots; - fc_tbl->target_max_slots = 0; + fc_tbl->max_slots = fc_tbl->target_highest_slotid 
+ 1; clp->cl_session->fc_attrs.max_reqs = fc_tbl->max_slots; spin_unlock(&fc_tbl->slot_tbl_lock); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 27b0fec1a6b0..05d34f1fcc19 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -5552,8 +5552,8 @@ static int decode_sequence(struct xdr_stream *xdr, } /* highest slot id - currently not processed */ dummy = be32_to_cpup(p++); - /* target highest slot id - currently not processed */ - dummy = be32_to_cpup(p++); + /* target highest slot id */ + res->sr_target_highest_slotid = be32_to_cpup(p++); /* result flags */ res->sr_status_flags = be32_to_cpup(p); status = 0; -- cgit v1.2.1 From da0507b7c95ccd4d9c86394eef42fe076032af30 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 20 Nov 2012 18:10:30 -0500 Subject: NFSv4.1: Reset the sequence number for slots that have been deallocated When the server tells us that it is dynamically resizing the session replay cache, we should reset the sequence number for those slots that have been deallocated. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 18 ++++++++++++++++++ fs/nfs/nfs4xdr.c | 4 ++-- 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d91abaa522e8..52435ec44193 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -498,6 +498,22 @@ static void nfs41_set_target_slotid_locked(struct nfs4_slot_table *tbl, tbl->generation++; } +static void nfs41_set_server_slotid_locked(struct nfs4_slot_table *tbl, + u32 highest_slotid) +{ + unsigned int max_slotid, i; + + if (tbl->server_highest_slotid == highest_slotid) + return; + if (tbl->highest_used_slotid > highest_slotid) + return; + max_slotid = min(tbl->max_slots - 1, highest_slotid); + /* Reset the seq_nr for deallocated slots */ + for (i = tbl->server_highest_slotid + 1; i <= max_slotid; i++) + tbl->slots[i].seq_nr = 1; + tbl->server_highest_slotid = highest_slotid; +} + static void nfs41_update_target_slotid(struct nfs4_slot_table *tbl, struct nfs4_slot *slot, struct nfs4_sequence_res *res) @@ -505,6 +521,7 @@ static void nfs41_update_target_slotid(struct nfs4_slot_table *tbl, spin_lock(&tbl->slot_tbl_lock); if (tbl->generation != slot->generation) goto out; + nfs41_set_server_slotid_locked(tbl, res->sr_highest_slotid); nfs41_set_target_slotid_locked(tbl, res->sr_target_highest_slotid); out: spin_unlock(&tbl->slot_tbl_lock); @@ -5718,6 +5735,7 @@ static void nfs4_add_and_init_slots(struct nfs4_slot_table *tbl, } tbl->highest_used_slotid = NFS4_NO_SLOT; tbl->target_highest_slotid = max_slots - 1; + tbl->server_highest_slotid = max_slots - 1; for (i = 0; i < tbl->max_slots; i++) tbl->slots[i].seq_nr = ivalue; spin_unlock(&tbl->slot_tbl_lock); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 05d34f1fcc19..a67040f51597 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -5550,8 +5550,8 @@ static int decode_sequence(struct xdr_stream *xdr, dprintk("%s Invalid slot id\n", __func__); goto out_err; } - /* highest slot id - currently not processed */ - dummy = be32_to_cpup(p++); + /* highest slot id */ + res->sr_highest_slotid = be32_to_cpup(p++); /* target highest slot id */ res->sr_target_highest_slotid = be32_to_cpup(p++); /* result flags */ -- cgit v1.2.1 From ce008c4bb9766bc7eeb02e8299c8baadc25da90b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 20 Nov 2012 15:16:30 -0500 Subject: NFSv4.1: Fix nfs4_callback_recallslot to work with dynamic slot allocation Ensure that the NFSv4.1 CB_RECALL_SLOT callback updates the slot table target max 
slotid safely. Signed-off-by: Trond Myklebust --- fs/nfs/callback_proc.c | 2 +- fs/nfs/nfs4_fs.h | 2 ++ fs/nfs/nfs4proc.c | 8 ++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 0ef047b7d28d..15b9879d6fbb 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -576,7 +576,7 @@ __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy, if (args->crsa_target_max_slots == fc_tbl->max_slots) goto out; - fc_tbl->target_highest_slotid = args->crsa_target_max_slots; + nfs41_set_target_slotid(fc_tbl, args->crsa_target_max_slots); nfs41_handle_recall_slot(cps->clp); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 42c58691fb41..5d4e82b10c3c 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -260,6 +260,8 @@ extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, extern struct nfs4_slot *nfs4_alloc_slots(struct nfs4_slot_table *table, u32 max_slots, gfp_t gfp_flags); +extern void nfs41_set_target_slotid(struct nfs4_slot_table *tbl, + u32 target_highest_slotid); static inline bool is_ds_only_client(struct nfs_client *clp) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 52435ec44193..62212231ce62 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -498,6 +498,14 @@ static void nfs41_set_target_slotid_locked(struct nfs4_slot_table *tbl, tbl->generation++; } +void nfs41_set_target_slotid(struct nfs4_slot_table *tbl, + u32 target_highest_slotid) +{ + spin_lock(&tbl->slot_tbl_lock); + nfs41_set_target_slotid_locked(tbl, target_highest_slotid); + spin_unlock(&tbl->slot_tbl_lock); +} + static void nfs41_set_server_slotid_locked(struct nfs4_slot_table *tbl, u32 highest_slotid) { -- cgit v1.2.1 From d5fb4ce33e26e4c1c31c1609b8ffbb24f80bcab8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 20 Nov 2012 20:24:02 -0500 Subject: NFSv4.1: Don't confuse target_highest_slotid and max_slots in cb_recall_slot Don't confuse the table size and the target_highest_slotid... 
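The underlying confusion is a classic off-by-one: a table with N slots numbers them 0 through N-1, so a size and a highest slotid always differ by exactly one. A two-line demonstration:

    #include <assert.h>

    int main(void)
    {
        unsigned int max_slots = 16;                  /* table size */
        unsigned int highest_slotid = max_slots - 1;  /* largest valid id */

        assert(highest_slotid + 1 == max_slots);
        /* CB_RECALL_SLOT carries a target slotid; comparing it against
         * max_slots as if it were a size is off by one. */
        return 0;
    }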
Signed-off-by: Trond Myklebust --- fs/nfs/callback.h | 2 +- fs/nfs/callback_proc.c | 12 +++++------- fs/nfs/callback_xdr.c | 2 +- 3 files changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 4251c2ae06ad..e75631e264f4 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -142,7 +142,7 @@ extern __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, struct cb_recallslotargs { struct sockaddr *crsa_addr; - uint32_t crsa_target_max_slots; + uint32_t crsa_target_highest_slotid; }; extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy, diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 15b9879d6fbb..ed0b446e2e38 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -561,22 +561,20 @@ __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy, if (!cps->clp) /* set in cb_sequence */ goto out; - dprintk_rcu("NFS: CB_RECALL_SLOT request from %s target max slots %d\n", + dprintk_rcu("NFS: CB_RECALL_SLOT request from %s target highest slotid %d\n", rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR), - args->crsa_target_max_slots); + args->crsa_target_highest_slotid); fc_tbl = &cps->clp->cl_session->fc_slot_table; status = htonl(NFS4ERR_BAD_HIGH_SLOT); - if (args->crsa_target_max_slots > fc_tbl->max_slots || - args->crsa_target_max_slots < 1) + if (args->crsa_target_highest_slotid >= fc_tbl->max_slots || + args->crsa_target_highest_slotid < 1) goto out; status = htonl(NFS4_OK); - if (args->crsa_target_max_slots == fc_tbl->max_slots) - goto out; - nfs41_set_target_slotid(fc_tbl, args->crsa_target_max_slots); + nfs41_set_target_slotid(fc_tbl, args->crsa_target_highest_slotid); nfs41_handle_recall_slot(cps->clp); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 742ff4ffced7..81e8c7d4c2e8 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -520,7 +520,7 @@ static __be32 decode_recallslot_args(struct svc_rqst *rqstp, p = read_buf(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); - args->crsa_target_max_slots = ntohl(*p++); + args->crsa_target_highest_slotid = ntohl(*p++); return 0; } -- cgit v1.2.1 From 1b285ff16ab52fb401aed7ce70abed4bb65b30b5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 20 Nov 2012 22:32:48 -0500 Subject: NFSv4.1: Allow the server to recall all but one slot If the server wants to leave us with only one slot, or it wants to "shrink" our slot table to something larger than we have now, then so be it. 
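With the bounds check gone, the client's protection is clamping rather than rejection: whatever target the server proposes is capped at what the client can support (the follow-up patches clamp against NFS4_MAX_SLOT_TABLE). Roughly, as a hedged sketch:

    /* Accept any server-proposed target and clamp it to the client's
     * hard limit instead of failing the callback with BAD_HIGH_SLOT.
     * HARD_MAX stands in for NFS4_MAX_SLOT_TABLE. */
    enum { HARD_MAX = 1024 };

    static unsigned int clamp_target_slotid(unsigned int target)
    {
        return target < HARD_MAX ? target : HARD_MAX - 1;
    }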
Signed-off-by: Trond Myklebust --- fs/nfs/callback_proc.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index ed0b446e2e38..a0546eca6f6b 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -567,11 +567,6 @@ __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy, fc_tbl = &cps->clp->cl_session->fc_slot_table; - status = htonl(NFS4ERR_BAD_HIGH_SLOT); - if (args->crsa_target_highest_slotid >= fc_tbl->max_slots || - args->crsa_target_highest_slotid < 1) - goto out; - status = htonl(NFS4_OK); nfs41_set_target_slotid(fc_tbl, args->crsa_target_highest_slotid); -- cgit v1.2.1 From 97e548a93de213b149eea025a97d88e28143b445 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 20 Nov 2012 14:45:48 -0500 Subject: NFSv4.1: Support dynamic resizing of the session slot table Allow the server to control the size of the session slot table by adjusting the value of sr_target_max_slots in the reply to the SEQUENCE operation. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 12 ++++++++++-- fs/nfs/nfs4state.c | 6 +++--- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 62212231ce62..1792ece8b53c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -492,10 +492,17 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) static void nfs41_set_target_slotid_locked(struct nfs4_slot_table *tbl, u32 target_highest_slotid) { + unsigned int max_slotid, i; + if (tbl->target_highest_slotid == target_highest_slotid) return; tbl->target_highest_slotid = target_highest_slotid; tbl->generation++; + + max_slotid = min(tbl->max_slots - 1, tbl->target_highest_slotid); + for (i = tbl->max_slotid + 1; i <= max_slotid; i++) + rpc_wake_up_next(&tbl->slot_tbl_waitq); + tbl->max_slotid = max_slotid; } void nfs41_set_target_slotid(struct nfs4_slot_table *tbl, @@ -622,8 +629,8 @@ static struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl) dprintk("--> %s used_slots=%04lx highest_used=%u max_slots=%u\n", __func__, tbl->used_slots[0], tbl->highest_used_slotid, tbl->max_slots); - slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slots); - if (slotid >= tbl->max_slots) + slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slotid + 1); + if (slotid > tbl->max_slotid) goto out; __set_bit(slotid, tbl->used_slots); if (slotid > tbl->highest_used_slotid || @@ -5744,6 +5751,7 @@ static void nfs4_add_and_init_slots(struct nfs4_slot_table *tbl, tbl->highest_used_slotid = NFS4_NO_SLOT; tbl->target_highest_slotid = max_slots - 1; tbl->server_highest_slotid = max_slots - 1; + tbl->max_slotid = max_slots - 1; for (i = 0; i < tbl->max_slots; i++) tbl->slots[i].seq_nr = ivalue; spin_unlock(&tbl->slot_tbl_lock); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 842cb8c2f65d..1b7fa73c9436 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -254,15 +254,14 @@ static void nfs4_end_drain_session(struct nfs_client *clp) { struct nfs4_session *ses = clp->cl_session; struct nfs4_slot_table *tbl; - int max_slots; + unsigned int i; if (ses == NULL) return; tbl = &ses->fc_slot_table; if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { spin_lock(&tbl->slot_tbl_lock); - max_slots = tbl->max_slots; - while (max_slots--) { + for (i = 0; i <= tbl->max_slotid; i++) { if (rpc_wake_up_first(&tbl->slot_tbl_waitq, nfs4_set_task_privileged, NULL) == NULL) @@ -2043,6 +2042,7 @@ static int nfs4_recall_slot(struct 
nfs_client *clp) old = fc_tbl->slots; fc_tbl->slots = new; fc_tbl->max_slots = fc_tbl->target_highest_slotid + 1; + fc_tbl->max_slotid = fc_tbl->target_highest_slotid; clp->cl_session->fc_attrs.max_reqs = fc_tbl->max_slots; spin_unlock(&fc_tbl->slot_tbl_lock); -- cgit v1.2.1 From 87dda67e7386ba7d2164391ea58b34e028d8157b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 20 Nov 2012 19:49:20 -0500 Subject: NFSv4.1: Allow SEQUENCE to resize the slot table on the fly Instead of an array of slots, use a singly linked list of slots that can be dynamically appended to or shrunk. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 4 +- fs/nfs/nfs4proc.c | 174 +++++++++++++++++++++++++++++++++++------------------ fs/nfs/nfs4state.c | 22 ++----- 3 files changed, 120 insertions(+), 80 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 5d4e82b10c3c..856bc496a210 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -258,10 +258,10 @@ extern int nfs4_proc_get_lease_time(struct nfs_client *clp, extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync); -extern struct nfs4_slot *nfs4_alloc_slots(struct nfs4_slot_table *table, - u32 max_slots, gfp_t gfp_flags); extern void nfs41_set_target_slotid(struct nfs4_slot_table *tbl, u32 target_highest_slotid); +extern int nfs4_resize_slot_table(struct nfs4_slot_table *tbl, + u32 max_reqs, u32 ivalue); static inline bool is_ds_only_client(struct nfs_client *clp) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1792ece8b53c..fc65300172e1 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -396,6 +396,27 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp #if defined(CONFIG_NFS_V4_1) +/* + * nfs4_shrink_slot_table - free retired slots from the slot table + */ +static void nfs4_shrink_slot_table(struct nfs4_slot_table *tbl, u32 newsize) +{ + struct nfs4_slot **p; + if (newsize >= tbl->max_slots) + return; + + p = &tbl->slots; + while (newsize--) + p = &(*p)->next; + while (*p) { + struct nfs4_slot *slot = *p; + + *p = slot->next; + kfree(slot); + tbl->max_slots--; + } +} + /* * nfs4_free_slot - free a slot and efficiently update slot table. 
* @@ -499,7 +520,7 @@ static void nfs41_set_target_slotid_locked(struct nfs4_slot_table *tbl, tbl->target_highest_slotid = target_highest_slotid; tbl->generation++; - max_slotid = min(tbl->max_slots - 1, tbl->target_highest_slotid); + max_slotid = min(NFS4_MAX_SLOT_TABLE - 1, tbl->target_highest_slotid); for (i = tbl->max_slotid + 1; i <= max_slotid; i++) rpc_wake_up_next(&tbl->slot_tbl_waitq); tbl->max_slotid = max_slotid; @@ -516,16 +537,12 @@ void nfs41_set_target_slotid(struct nfs4_slot_table *tbl, static void nfs41_set_server_slotid_locked(struct nfs4_slot_table *tbl, u32 highest_slotid) { - unsigned int max_slotid, i; - if (tbl->server_highest_slotid == highest_slotid) return; if (tbl->highest_used_slotid > highest_slotid) return; - max_slotid = min(tbl->max_slots - 1, highest_slotid); - /* Reset the seq_nr for deallocated slots */ - for (i = tbl->server_highest_slotid + 1; i <= max_slotid; i++) - tbl->slots[i].seq_nr = 1; + /* Deallocate slots */ + nfs4_shrink_slot_table(tbl, highest_slotid + 1); tbl->server_highest_slotid = highest_slotid; } @@ -612,6 +629,42 @@ static int nfs4_sequence_done(struct rpc_task *task, return nfs41_sequence_done(task, res); } +static struct nfs4_slot *nfs4_new_slot(struct nfs4_slot_table *tbl, + u32 slotid, u32 seq_init, gfp_t gfp_mask) +{ + struct nfs4_slot *slot; + + slot = kzalloc(sizeof(*slot), gfp_mask); + if (slot) { + slot->table = tbl; + slot->slot_nr = slotid; + slot->seq_nr = seq_init; + } + return slot; +} + +static struct nfs4_slot *nfs4_find_or_create_slot(struct nfs4_slot_table *tbl, + u32 slotid, u32 seq_init, gfp_t gfp_mask) +{ + struct nfs4_slot **p, *slot; + + p = &tbl->slots; + for (;;) { + if (*p == NULL) { + *p = nfs4_new_slot(tbl, tbl->max_slots, + seq_init, gfp_mask); + if (*p == NULL) + break; + tbl->max_slots++; + } + slot = *p; + if (slot->slot_nr == slotid) + return slot; + p = &slot->next; + } + return NULL; +} + /* * nfs4_alloc_slot - efficiently look for a free slot * @@ -628,15 +681,17 @@ static struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl) dprintk("--> %s used_slots=%04lx highest_used=%u max_slots=%u\n", __func__, tbl->used_slots[0], tbl->highest_used_slotid, - tbl->max_slots); + tbl->max_slotid + 1); slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slotid + 1); if (slotid > tbl->max_slotid) goto out; + ret = nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT); + if (ret == NULL) + goto out; __set_bit(slotid, tbl->used_slots); if (slotid > tbl->highest_used_slotid || tbl->highest_used_slotid == NFS4_NO_SLOT) tbl->highest_used_slotid = slotid; - ret = &tbl->slots[slotid]; ret->renewal_time = jiffies; ret->generation = tbl->generation; @@ -5718,67 +5773,56 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) return status; } -struct nfs4_slot *nfs4_alloc_slots(struct nfs4_slot_table *table, - u32 max_slots, gfp_t gfp_flags) +static int nfs4_grow_slot_table(struct nfs4_slot_table *tbl, + u32 max_reqs, u32 ivalue) { - struct nfs4_slot *tbl; - u32 i; - - tbl = kmalloc_array(max_slots, sizeof(*tbl), gfp_flags); - if (tbl != NULL) { - for (i = 0; i < max_slots; i++) { - tbl[i].table = table; - tbl[i].slot_nr = i; - } - } - return tbl; + if (max_reqs <= tbl->max_slots) + return 0; + if (nfs4_find_or_create_slot(tbl, max_reqs - 1, ivalue, GFP_NOFS)) + return 0; + return -ENOMEM; } -static void nfs4_add_and_init_slots(struct nfs4_slot_table *tbl, - struct nfs4_slot *new, - u32 max_slots, +static void nfs4_reset_slot_table(struct nfs4_slot_table *tbl, + u32 server_highest_slotid, 
u32 ivalue) { - struct nfs4_slot *old = NULL; - u32 i; + struct nfs4_slot **p; - spin_lock(&tbl->slot_tbl_lock); - if (new) { - old = tbl->slots; - tbl->slots = new; - tbl->max_slots = max_slots; + nfs4_shrink_slot_table(tbl, server_highest_slotid + 1); + p = &tbl->slots; + while (*p) { + (*p)->seq_nr = ivalue; + p = &(*p)->next; } tbl->highest_used_slotid = NFS4_NO_SLOT; - tbl->target_highest_slotid = max_slots - 1; - tbl->server_highest_slotid = max_slots - 1; - tbl->max_slotid = max_slots - 1; - for (i = 0; i < tbl->max_slots; i++) - tbl->slots[i].seq_nr = ivalue; - spin_unlock(&tbl->slot_tbl_lock); - kfree(old); + tbl->target_highest_slotid = server_highest_slotid; + tbl->server_highest_slotid = server_highest_slotid; + tbl->max_slotid = server_highest_slotid; } /* * (re)Initialise a slot table */ -static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs, - u32 ivalue) +static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl, + u32 max_reqs, u32 ivalue) { - struct nfs4_slot *new = NULL; - int ret = -ENOMEM; + int ret; dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__, max_reqs, tbl->max_slots); - /* Does the newly negotiated max_reqs match the existing slot table? */ - if (max_reqs != tbl->max_slots) { - new = nfs4_alloc_slots(tbl, max_reqs, GFP_NOFS); - if (!new) - goto out; - } - ret = 0; + if (max_reqs > NFS4_MAX_SLOT_TABLE) + max_reqs = NFS4_MAX_SLOT_TABLE; + + ret = nfs4_grow_slot_table(tbl, max_reqs, ivalue); + if (ret) + goto out; + + spin_lock(&tbl->slot_tbl_lock); + nfs4_reset_slot_table(tbl, max_reqs - 1, ivalue); + spin_unlock(&tbl->slot_tbl_lock); - nfs4_add_and_init_slots(tbl, new, max_reqs, ivalue); dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, tbl, tbl->slots, tbl->max_slots); out: @@ -5786,18 +5830,28 @@ out: return ret; } +int nfs4_resize_slot_table(struct nfs4_slot_table *tbl, + u32 max_reqs, u32 ivalue) +{ + int ret; + + if (max_reqs > NFS4_MAX_SLOT_TABLE) + max_reqs = NFS4_MAX_SLOT_TABLE; + ret = nfs4_grow_slot_table(tbl, max_reqs, ivalue); + if (ret) + return ret; + spin_lock(&tbl->slot_tbl_lock); + nfs4_shrink_slot_table(tbl, max_reqs); + tbl->max_slotid = max_reqs - 1; + spin_unlock(&tbl->slot_tbl_lock); + return 0; +} + /* Destroy the slot table */ static void nfs4_destroy_slot_tables(struct nfs4_session *session) { - if (session->fc_slot_table.slots != NULL) { - kfree(session->fc_slot_table.slots); - session->fc_slot_table.slots = NULL; - } - if (session->bc_slot_table.slots != NULL) { - kfree(session->bc_slot_table.slots); - session->bc_slot_table.slots = NULL; - } - return; + nfs4_shrink_slot_table(&session->fc_slot_table, 0); + nfs4_shrink_slot_table(&session->bc_slot_table, 0); } /* diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 1b7fa73c9436..c14b2c7ac8a7 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2025,29 +2025,15 @@ out: static int nfs4_recall_slot(struct nfs_client *clp) { struct nfs4_slot_table *fc_tbl; - struct nfs4_slot *new, *old; - int i; + u32 new_size; if (!nfs4_has_session(clp)) return 0; nfs4_begin_drain_session(clp); - fc_tbl = &clp->cl_session->fc_slot_table; - new = nfs4_alloc_slots(fc_tbl, fc_tbl->target_highest_slotid + 1, GFP_NOFS); - if (!new) - return -ENOMEM; - spin_lock(&fc_tbl->slot_tbl_lock); - for (i = 0; i <= fc_tbl->target_highest_slotid; i++) - new[i].seq_nr = fc_tbl->slots[i].seq_nr; - old = fc_tbl->slots; - fc_tbl->slots = new; - fc_tbl->max_slots = fc_tbl->target_highest_slotid + 1; - fc_tbl->max_slotid = fc_tbl->target_highest_slotid; - 
clp->cl_session->fc_attrs.max_reqs = fc_tbl->max_slots; - spin_unlock(&fc_tbl->slot_tbl_lock); - - kfree(old); - return 0; + fc_tbl = &clp->cl_session->fc_slot_table; + new_size = fc_tbl->server_highest_slotid + 1; + return nfs4_resize_slot_table(fc_tbl, new_size, 1); } static int nfs4_bind_conn_to_session(struct nfs_client *clp) -- cgit v1.2.1 From afa296103ea3841fdc81d9d66902fe49bb765527 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 20 Nov 2012 20:12:38 -0500 Subject: NFSv4.1: Remove the state manager code to resize the slot table The state manager no longer needs any special machinery to stop the session flow and resize the slot table. It is all done on the fly by the SEQUENCE op code now. Signed-off-by: Trond Myklebust --- fs/nfs/callback_proc.c | 1 - fs/nfs/nfs4_fs.h | 4 ---- fs/nfs/nfs4proc.c | 17 ----------------- fs/nfs/nfs4state.c | 33 --------------------------------- 4 files changed, 55 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index a0546eca6f6b..8610bd1d136d 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -570,7 +570,6 @@ __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy, status = htonl(NFS4_OK); nfs41_set_target_slotid(fc_tbl, args->crsa_target_highest_slotid); - nfs41_handle_recall_slot(cps->clp); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 856bc496a210..fa1a055a8fe9 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -21,7 +21,6 @@ enum nfs4_client_state { NFS4CLNT_RECLAIM_NOGRACE, NFS4CLNT_DELEGRETURN, NFS4CLNT_SESSION_RESET, - NFS4CLNT_RECALL_SLOT, NFS4CLNT_LEASE_CONFIRM, NFS4CLNT_SERVER_SCOPE_MISMATCH, NFS4CLNT_PURGE_STATE, @@ -260,8 +259,6 @@ extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, extern void nfs41_set_target_slotid(struct nfs4_slot_table *tbl, u32 target_highest_slotid); -extern int nfs4_resize_slot_table(struct nfs4_slot_table *tbl, - u32 max_reqs, u32 ivalue); static inline bool is_ds_only_client(struct nfs_client *clp) @@ -358,7 +355,6 @@ extern void nfs4_schedule_state_manager(struct nfs_client *); extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp); extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); -extern void nfs41_handle_recall_slot(struct nfs_client *clp); extern void nfs41_handle_server_scope(struct nfs_client *, struct nfs41_server_scope **); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index fc65300172e1..0642e28704de 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5830,23 +5830,6 @@ out: return ret; } -int nfs4_resize_slot_table(struct nfs4_slot_table *tbl, - u32 max_reqs, u32 ivalue) -{ - int ret; - - if (max_reqs > NFS4_MAX_SLOT_TABLE) - max_reqs = NFS4_MAX_SLOT_TABLE; - ret = nfs4_grow_slot_table(tbl, max_reqs, ivalue); - if (ret) - return ret; - spin_lock(&tbl->slot_tbl_lock); - nfs4_shrink_slot_table(tbl, max_reqs); - tbl->max_slotid = max_reqs - 1; - spin_unlock(&tbl->slot_tbl_lock); - return 0; -} - /* Destroy the slot table */ static void nfs4_destroy_slot_tables(struct nfs4_session *session) { diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index c14b2c7ac8a7..3940cd43fa98 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -302,7 +302,6 @@ static void 
nfs41_finish_session_reset(struct nfs_client *clp) clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); /* create_session negotiated new slot table */ - clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); nfs41_setup_state_renewal(clp); } @@ -1905,14 +1904,6 @@ void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) } EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery); -void nfs41_handle_recall_slot(struct nfs_client *clp) -{ - set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); - dprintk("%s: scheduling slot recall for server %s\n", __func__, - clp->cl_hostname); - nfs4_schedule_state_manager(clp); -} - static void nfs4_reset_all_state(struct nfs_client *clp) { if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { @@ -2022,20 +2013,6 @@ out: return status; } -static int nfs4_recall_slot(struct nfs_client *clp) -{ - struct nfs4_slot_table *fc_tbl; - u32 new_size; - - if (!nfs4_has_session(clp)) - return 0; - nfs4_begin_drain_session(clp); - - fc_tbl = &clp->cl_session->fc_slot_table; - new_size = fc_tbl->server_highest_slotid + 1; - return nfs4_resize_slot_table(fc_tbl, new_size, 1); -} - static int nfs4_bind_conn_to_session(struct nfs_client *clp) { struct rpc_cred *cred; @@ -2066,7 +2043,6 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp) #else /* CONFIG_NFS_V4_1 */ static int nfs4_reset_session(struct nfs_client *clp) { return 0; } static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; } -static int nfs4_recall_slot(struct nfs_client *clp) { return 0; } static int nfs4_bind_conn_to_session(struct nfs_client *clp) { @@ -2126,15 +2102,6 @@ static void nfs4_state_manager(struct nfs_client *clp) continue; } - /* Recall session slots */ - if (test_and_clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state)) { - section = "recall slot"; - status = nfs4_recall_slot(clp); - if (status < 0) - goto out_error; - continue; - } - /* First recover reboot state... */ if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) { section = "reclaim reboot"; -- cgit v1.2.1 From ac0748359a55faf4618f5f0bd9f9bf967c41d218 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 21 Nov 2012 09:06:11 -0500 Subject: NFSv4.1: CB_RECALL_SLOT must schedule a sequence op after updating targets RFC5661 requires us to make sure that the server knows we've updated our slot table size by sending at least one SEQUENCE op containing the new 'highest_slotid' value. We can do so using the 'CHECK_LEASE' functionality of the state manager. 
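The division of labour matters here: the callback thread must not issue RPCs itself, so it only records the new target and flags the state manager, which later sends the SEQUENCE the server is waiting to observe. A schematic of the hand-off (types and names are hypothetical):

    #include <stdbool.h>

    struct client_state {
        unsigned int target_highest_slotid;
        bool check_lease;   /* stands in for NFS4CLNT_CHECK_LEASE */
    };

    /* Callback thread: record the target, then poke the state manager. */
    static void cb_recall_slot(struct client_state *clp, unsigned int target)
    {
        clp->target_highest_slotid = target;
        clp->check_lease = true;   /* state manager will send a SEQUENCE */
    }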
Signed-off-by: Trond Myklebust --- fs/nfs/callback_proc.c | 1 + fs/nfs/nfs4_fs.h | 1 + fs/nfs/nfs4state.c | 12 ++++++++++++ 3 files changed, 14 insertions(+) (limited to 'fs') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 8610bd1d136d..f99faad78c72 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -570,6 +570,7 @@ __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy, status = htonl(NFS4_OK); nfs41_set_target_slotid(fc_tbl, args->crsa_target_highest_slotid); + nfs41_server_notify_target_slotid_update(cps->clp); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index fa1a055a8fe9..0a109ec75e69 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -334,6 +334,7 @@ struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp); int nfs41_discover_server_trunking(struct nfs_client *clp, struct nfs_client **, struct rpc_cred *); extern void nfs4_schedule_session_recovery(struct nfs4_session *, int); +extern void nfs41_server_notify_target_slotid_update(struct nfs_client *clp); #else static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) { diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 3940cd43fa98..896be2126f7e 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1904,6 +1904,18 @@ void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) } EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery); +static void nfs41_ping_server(struct nfs_client *clp) +{ + /* Use CHECK_LEASE to ping the server with a SEQUENCE */ + set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); + nfs4_schedule_state_manager(clp); +} + +void nfs41_server_notify_target_slotid_update(struct nfs_client *clp) +{ + nfs41_ping_server(clp); +} + static void nfs4_reset_all_state(struct nfs_client *clp) { if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { -- cgit v1.2.1 From 69d206b5b39e298755b60e8e7056cb240182eb95 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 22 Nov 2012 13:21:02 -0500 Subject: NFSv4.1: If slot allocation fails due to OOM, retry more quickly If the NFSv4.1 session slot allocation fails due to an ENOMEM condition, then set the task->tk_timeout to 1/4 second to ensure that we do retry the slot allocation more quickly. 
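Returning ERR_PTR(-ENOMEM) instead of a bare NULL is what lets the caller distinguish the two failure modes: out of slotids means sleep until a slot is freed, out of memory means sleep with a short timeout and retry. A userspace re-implementation of the kernel helpers, for illustration only:

    #include <errno.h>
    #include <stdio.h>

    #define MAX_ERRNO 4095
    #define ERR_PTR(err) ((void *)(long)(err))
    #define PTR_ERR(ptr) ((long)(ptr))
    #define IS_ERR(ptr)  ((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

    int main(void)
    {
        void *slot = ERR_PTR(-ENOMEM);   /* what the allocator might return */

        if (IS_ERR(slot)) {
            if (PTR_ERR(slot) == -ENOMEM)
                printf("sleep with a 1/4s timeout, then retry\n");
            else   /* -EBUSY: no free slotid */
                printf("sleep until a freed slot wakes us\n");
        }
        return 0;
    }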
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0642e28704de..e9e4d6393f1b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -662,7 +662,7 @@ static struct nfs4_slot *nfs4_find_or_create_slot(struct nfs4_slot_table *tbl, return slot; p = &slot->next; } - return NULL; + return ERR_PTR(-ENOMEM); } /* @@ -676,7 +676,7 @@ static struct nfs4_slot *nfs4_find_or_create_slot(struct nfs4_slot_table *tbl, */ static struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl) { - struct nfs4_slot *ret = NULL; + struct nfs4_slot *ret = ERR_PTR(-EBUSY); u32 slotid; dprintk("--> %s used_slots=%04lx highest_used=%u max_slots=%u\n", @@ -686,7 +686,7 @@ static struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl) if (slotid > tbl->max_slotid) goto out; ret = nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT); - if (ret == NULL) + if (IS_ERR(ret)) goto out; __set_bit(slotid, tbl->used_slots); if (slotid > tbl->highest_used_slotid || @@ -698,7 +698,7 @@ static struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl) out: dprintk("<-- %s used_slots=%04lx highest_used=%d slotid=%d \n", __func__, tbl->used_slots[0], tbl->highest_used_slotid, - ret ? ret->slot_nr : -1); + !IS_ERR(ret) ? ret->slot_nr : -1); return ret; } @@ -727,6 +727,8 @@ int nfs41_setup_sequence(struct nfs4_session *session, tbl = &session->fc_slot_table; + task->tk_timeout = 0; + spin_lock(&tbl->slot_tbl_lock); if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) && !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { @@ -746,7 +748,10 @@ int nfs41_setup_sequence(struct nfs4_session *session, } slot = nfs4_alloc_slot(tbl); - if (slot == NULL) { + if (IS_ERR(slot)) { + /* If out of memory, try again in 1/4 second */ + if (slot == ERR_PTR(-ENOMEM)) + task->tk_timeout = HZ >> 2; rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); spin_unlock(&tbl->slot_tbl_lock); dprintk("<-- %s: no free slots\n", __func__); @@ -5778,7 +5783,7 @@ static int nfs4_grow_slot_table(struct nfs4_slot_table *tbl, { if (max_reqs <= tbl->max_slots) return 0; - if (nfs4_find_or_create_slot(tbl, max_reqs - 1, ivalue, GFP_NOFS)) + if (!IS_ERR(nfs4_find_or_create_slot(tbl, max_reqs - 1, ivalue, GFP_NOFS))) return 0; return -ENOMEM; } -- cgit v1.2.1 From 5d63360dd8daffc2bc86531e9a44ff9d4881b102 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 23 Nov 2012 13:09:38 -0500 Subject: NFSv4.1: Clean up session draining Coalesce nfs4_check_drain_bc_complete and nfs4_check_drain_fc_complete into a single function that can be called when the slot table is known to be empty, then change nfs4_callback_free_slot() and nfs4_free_slot() to use it. 
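After the coalescing, a single predicate covers both channels: the drain is complete exactly when the table is marked draining and its highest used slotid has fallen back to NFS4_NO_SLOT. In sketch form, with the completion modelled as a flag:

    #include <stdbool.h>

    #define NO_SLOT ((unsigned int)-1)   /* stands in for NFS4_NO_SLOT */

    struct slot_tbl {
        unsigned int highest_used_slotid;
        bool draining;
        bool drained;   /* models complete(&tbl->complete) */
    };

    /* Called under the table lock whenever the table empties. */
    static void session_drain_complete(struct slot_tbl *tbl)
    {
        if (tbl->draining && tbl->highest_used_slotid == NO_SLOT)
            tbl->drained = true;
    }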
Signed-off-by: Trond Myklebust --- fs/nfs/callback.h | 2 -- fs/nfs/callback_xdr.c | 2 +- fs/nfs/nfs4_fs.h | 8 ++++++++ fs/nfs/nfs4proc.c | 38 ++++++-------------------------------- fs/nfs/nfs4state.c | 10 ++++++++++ 5 files changed, 25 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index e75631e264f4..efd54f0a4c46 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -167,8 +167,6 @@ extern __be32 nfs4_callback_layoutrecall( struct cb_layoutrecallargs *args, void *dummy, struct cb_process_state *cps); -extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses); - struct cb_devicenotifyitem { uint32_t cbd_notify_type; uint32_t cbd_layout_type; diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 81e8c7d4c2e8..ea6a7b190e6b 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -762,7 +762,7 @@ static void nfs4_callback_free_slot(struct nfs4_session *session) * A single slot, so highest used slotid is either 0 or -1 */ tbl->highest_used_slotid = NFS4_NO_SLOT; - nfs4_check_drain_bc_complete(session); + nfs4_session_drain_complete(session, tbl); spin_unlock(&tbl->slot_tbl_lock); } diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 0a109ec75e69..16b19372c4ba 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -335,6 +335,14 @@ int nfs41_discover_server_trunking(struct nfs_client *clp, struct nfs_client **, struct rpc_cred *); extern void nfs4_schedule_session_recovery(struct nfs4_session *, int); extern void nfs41_server_notify_target_slotid_update(struct nfs_client *clp); + +extern void nfs4_session_drain_complete(struct nfs4_session *session, + struct nfs4_slot_table *tbl); + +static inline bool nfs4_session_draining(struct nfs4_session *session) +{ + return !!test_bit(NFS4_SESSION_DRAINING, &session->session_state); +} #else static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e9e4d6393f1b..0b0f11be40f9 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -445,8 +445,10 @@ nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot) u32 new_max = find_last_bit(tbl->used_slots, slotid); if (new_max < slotid) tbl->highest_used_slotid = new_max; - else + else { tbl->highest_used_slotid = NFS4_NO_SLOT; + nfs4_session_drain_complete(tbl->session, tbl); + } } dprintk("%s: slotid %u highest_used_slotid %d\n", __func__, slotid, tbl->highest_used_slotid); @@ -458,36 +460,6 @@ bool nfs4_set_task_privileged(struct rpc_task *task, void *dummy) return true; } -/* - * Signal state manager thread if session fore channel is drained - */ -static void nfs4_check_drain_fc_complete(struct nfs4_session *ses) -{ - if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { - rpc_wake_up_first(&ses->fc_slot_table.slot_tbl_waitq, - nfs4_set_task_privileged, NULL); - return; - } - - if (ses->fc_slot_table.highest_used_slotid != NFS4_NO_SLOT) - return; - - dprintk("%s COMPLETE: Session Fore Channel Drained\n", __func__); - complete(&ses->fc_slot_table.complete); -} - -/* - * Signal state manager thread if session back channel is drained - */ -void nfs4_check_drain_bc_complete(struct nfs4_session *ses) -{ - if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state) || - ses->bc_slot_table.highest_used_slotid != NFS4_NO_SLOT) - return; - dprintk("%s COMPLETE: Session Back Channel Drained\n", __func__); - complete(&ses->bc_slot_table.complete); -} - static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) { 
struct nfs4_session *session; @@ -504,7 +476,9 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) spin_lock(&tbl->slot_tbl_lock); nfs4_free_slot(tbl, res->sr_slot); - nfs4_check_drain_fc_complete(session); + if (!nfs4_session_draining(session)) + rpc_wake_up_first(&tbl->slot_tbl_waitq, + nfs4_set_task_privileged, NULL); spin_unlock(&tbl->slot_tbl_lock); res->sr_slot = NULL; } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 896be2126f7e..1fb3e6c6f993 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -271,6 +271,16 @@ static void nfs4_end_drain_session(struct nfs_client *clp) } } +/* + * Signal state manager thread if session fore channel is drained + */ +void nfs4_session_drain_complete(struct nfs4_session *session, + struct nfs4_slot_table *tbl) +{ + if (nfs4_session_draining(session)) + complete(&tbl->complete); +} + static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl) { spin_lock(&tbl->slot_tbl_lock); -- cgit v1.2.1 From 330212796756ca2752b2a70a83860e145b77487c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 26 Nov 2012 13:13:29 -0500 Subject: NFSv4: Move nfs4_wait_clnt_recover and nfs4_client_recover_expired_lease nfs4_wait_clnt_recover and nfs4_client_recover_expired_lease are both generic state related functions. As such, they belong in nfs4state.c, and not nfs4proc.c Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 4 ++++ fs/nfs/nfs4proc.c | 36 ------------------------------------ fs/nfs/nfs4state.c | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 16b19372c4ba..2f6a9f9d9299 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -11,6 +11,8 @@ #if IS_ENABLED(CONFIG_NFS_V4) +#define NFS4_MAX_LOOP_ON_RECOVER (10) + struct idmap; enum nfs4_client_state { @@ -360,6 +362,8 @@ extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); extern void nfs_inode_find_state_and_recover(struct inode *inode, const nfs4_stateid *stateid); extern void nfs4_schedule_lease_recovery(struct nfs_client *); +extern int nfs4_wait_clnt_recover(struct nfs_client *clp); +extern int nfs4_client_recover_expired_lease(struct nfs_client *clp); extern void nfs4_schedule_state_manager(struct nfs_client *); extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp); extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0b0f11be40f9..d75e2a2576eb 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -70,8 +70,6 @@ #define NFS4_POLL_RETRY_MIN (HZ/10) #define NFS4_POLL_RETRY_MAX (15*HZ) -#define NFS4_MAX_LOOP_ON_RECOVER (10) - struct nfs4_opendata; static int _nfs4_proc_open(struct nfs4_opendata *data); static int _nfs4_recover_proc_open(struct nfs4_opendata *data); @@ -255,22 +253,6 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent kunmap_atomic(start); } -static int nfs4_wait_clnt_recover(struct nfs_client *clp) -{ - int res; - - might_sleep(); - - res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, - nfs_wait_bit_killable, TASK_KILLABLE); - if (res) - return res; - - if (clp->cl_cons_state < 0) - return clp->cl_cons_state; - return 0; -} - static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) { int res = 0; @@ -1883,24 +1865,6 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) return 0; } -static int nfs4_client_recover_expired_lease(struct nfs_client *clp) -{ - 
unsigned int loop; - int ret; - - for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) { - ret = nfs4_wait_clnt_recover(clp); - if (ret != 0) - break; - if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && - !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) - break; - nfs4_schedule_state_manager(clp); - ret = -EIO; - } - return ret; -} - static int nfs4_recover_expired_lease(struct nfs_server *server) { return nfs4_client_recover_expired_lease(server->nfs_client); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 1fb3e6c6f993..1077b9698381 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1216,6 +1216,40 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp) } EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery); +int nfs4_wait_clnt_recover(struct nfs_client *clp) +{ + int res; + + might_sleep(); + + res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (res) + return res; + + if (clp->cl_cons_state < 0) + return clp->cl_cons_state; + return 0; +} + +int nfs4_client_recover_expired_lease(struct nfs_client *clp) +{ + unsigned int loop; + int ret; + + for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) { + ret = nfs4_wait_clnt_recover(clp); + if (ret != 0) + break; + if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && + !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) + break; + nfs4_schedule_state_manager(clp); + ret = -EIO; + } + return ret; +} + /* * nfs40_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN * @clp: client to process -- cgit v1.2.1 From 73e39aaa8366694450cd6034050f542f965e277d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 26 Nov 2012 12:49:34 -0500 Subject: NFSv4.1: Cleanup move session slot management to fs/nfs/nfs4session.c NFSv4.1 session management is getting complex enough to deserve a separate file. 
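The split follows the usual kernel recipe: the new nfs4session.h exposes only the slot-table API that the rest of the client consumes, and everything else becomes private to nfs4session.c. A guess at the rough shape of such a header (not the real file):

    #ifndef _SKETCH_NFS4SESSION_H
    #define _SKETCH_NFS4SESSION_H

    struct nfs4_slot_table;   /* definition stays private to the .c file */

    extern void nfs41_set_target_slotid(struct nfs4_slot_table *tbl,
                                        unsigned int target_highest_slotid);

    #endif /* _SKETCH_NFS4SESSION_H */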
Signed-off-by: Trond Myklebust --- fs/nfs/Makefile | 2 +- fs/nfs/callback_proc.c | 1 + fs/nfs/internal.h | 2 - fs/nfs/nfs4_fs.h | 11 -- fs/nfs/nfs4client.c | 1 + fs/nfs/nfs4filelayoutdev.c | 1 + fs/nfs/nfs4proc.c | 415 +----------------------------------------- fs/nfs/nfs4session.c | 436 +++++++++++++++++++++++++++++++++++++++++++++ fs/nfs/nfs4session.h | 35 ++++ 9 files changed, 477 insertions(+), 427 deletions(-) create mode 100644 fs/nfs/nfs4session.c create mode 100644 fs/nfs/nfs4session.h (limited to 'fs') diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index b7db60897f91..cce2c057bd2d 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -24,7 +24,7 @@ nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \ nfs4namespace.o nfs4getroot.o nfs4client.o nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o -nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o +nfsv4-$(CONFIG_NFS_V4_1) += nfs4session.o pnfs.o pnfs_dev.o obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index f99faad78c72..c89b26bc9759 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -14,6 +14,7 @@ #include "delegation.h" #include "internal.h" #include "pnfs.h" +#include "nfs4session.h" #ifdef NFS_DEBUG #define NFSDBG_FACILITY NFSDBG_CALLBACK diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 05521cadac2e..8965a998b306 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -276,8 +276,6 @@ extern const u32 nfs41_maxwrite_overhead; extern struct rpc_procinfo nfs4_procedures[]; #endif -extern int nfs4_init_ds_session(struct nfs_client *, unsigned long); - /* proc.c */ void nfs_close_context(struct nfs_open_context *ctx, int is_sync); extern struct nfs_client *nfs_init_client(struct nfs_client *clp, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 2f6a9f9d9299..cd3e3096b60a 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -249,19 +249,13 @@ extern int nfs4_setup_sequence(const struct nfs_server *server, extern int nfs41_setup_sequence(struct nfs4_session *session, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, struct rpc_task *task); -extern void nfs4_destroy_session(struct nfs4_session *session); -extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); extern int nfs4_proc_create_session(struct nfs_client *, struct rpc_cred *); extern int nfs4_proc_destroy_session(struct nfs4_session *, struct rpc_cred *); -extern int nfs4_init_session(struct nfs_server *server); extern int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo); extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync); -extern void nfs41_set_target_slotid(struct nfs4_slot_table *tbl, - u32 target_highest_slotid); - static inline bool is_ds_only_client(struct nfs_client *clp) { @@ -287,11 +281,6 @@ static inline int nfs4_setup_sequence(const struct nfs_server *server, return 0; } -static inline int nfs4_init_session(struct nfs_server *server) -{ - return 0; -} - static inline bool is_ds_only_client(struct nfs_client *clp) { diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 72717e67b34e..acc347268124 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -12,6 +12,7 @@ #include "internal.h" #include "callback.h" #include "delegation.h" +#include "nfs4session.h" #include "pnfs.h" #include "netns.h" diff --git 
a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index 93e2530d7098..b720064bcd7f 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -33,6 +33,7 @@ #include #include "internal.h" +#include "nfs4session.h" #include "nfs4filelayout.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d75e2a2576eb..a0c35ab12a6b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -52,7 +52,6 @@ #include #include #include -#include #include #include #include @@ -64,6 +63,8 @@ #include "callback.h" #include "pnfs.h" #include "netns.h" +#include "nfs4session.h" + #define NFSDBG_FACILITY NFSDBG_PROC @@ -378,64 +379,6 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp #if defined(CONFIG_NFS_V4_1) -/* - * nfs4_shrink_slot_table - free retired slots from the slot table - */ -static void nfs4_shrink_slot_table(struct nfs4_slot_table *tbl, u32 newsize) -{ - struct nfs4_slot **p; - if (newsize >= tbl->max_slots) - return; - - p = &tbl->slots; - while (newsize--) - p = &(*p)->next; - while (*p) { - struct nfs4_slot *slot = *p; - - *p = slot->next; - kfree(slot); - tbl->max_slots--; - } -} - -/* - * nfs4_free_slot - free a slot and efficiently update slot table. - * - * freeing a slot is trivially done by clearing its respective bit - * in the bitmap. - * If the freed slotid equals highest_used_slotid we want to update it - * so that the server would be able to size down the slot table if needed, - * otherwise we know that the highest_used_slotid is still in use. - * When updating highest_used_slotid there may be "holes" in the bitmap - * so we need to scan down from highest_used_slotid to 0 looking for the now - * highest slotid in use. - * If none found, highest_used_slotid is set to NFS4_NO_SLOT. 
- * - * Must be called while holding tbl->slot_tbl_lock - */ -static void -nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot) -{ - u32 slotid = slot->slot_nr; - - /* clear used bit in bitmap */ - __clear_bit(slotid, tbl->used_slots); - - /* update highest_used_slotid when it is freed */ - if (slotid == tbl->highest_used_slotid) { - u32 new_max = find_last_bit(tbl->used_slots, slotid); - if (new_max < slotid) - tbl->highest_used_slotid = new_max; - else { - tbl->highest_used_slotid = NFS4_NO_SLOT; - nfs4_session_drain_complete(tbl->session, tbl); - } - } - dprintk("%s: slotid %u highest_used_slotid %d\n", __func__, - slotid, tbl->highest_used_slotid); -} - bool nfs4_set_task_privileged(struct rpc_task *task, void *dummy) { rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); @@ -465,56 +408,6 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) res->sr_slot = NULL; } -/* Update the client's idea of target_highest_slotid */ -static void nfs41_set_target_slotid_locked(struct nfs4_slot_table *tbl, - u32 target_highest_slotid) -{ - unsigned int max_slotid, i; - - if (tbl->target_highest_slotid == target_highest_slotid) - return; - tbl->target_highest_slotid = target_highest_slotid; - tbl->generation++; - - max_slotid = min(NFS4_MAX_SLOT_TABLE - 1, tbl->target_highest_slotid); - for (i = tbl->max_slotid + 1; i <= max_slotid; i++) - rpc_wake_up_next(&tbl->slot_tbl_waitq); - tbl->max_slotid = max_slotid; -} - -void nfs41_set_target_slotid(struct nfs4_slot_table *tbl, - u32 target_highest_slotid) -{ - spin_lock(&tbl->slot_tbl_lock); - nfs41_set_target_slotid_locked(tbl, target_highest_slotid); - spin_unlock(&tbl->slot_tbl_lock); -} - -static void nfs41_set_server_slotid_locked(struct nfs4_slot_table *tbl, - u32 highest_slotid) -{ - if (tbl->server_highest_slotid == highest_slotid) - return; - if (tbl->highest_used_slotid > highest_slotid) - return; - /* Deallocate slots */ - nfs4_shrink_slot_table(tbl, highest_slotid + 1); - tbl->server_highest_slotid = highest_slotid; -} - -static void nfs41_update_target_slotid(struct nfs4_slot_table *tbl, - struct nfs4_slot *slot, - struct nfs4_sequence_res *res) -{ - spin_lock(&tbl->slot_tbl_lock); - if (tbl->generation != slot->generation) - goto out; - nfs41_set_server_slotid_locked(tbl, res->sr_highest_slotid); - nfs41_set_target_slotid_locked(tbl, res->sr_target_highest_slotid); -out: - spin_unlock(&tbl->slot_tbl_lock); -} - static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) { struct nfs4_session *session; @@ -585,79 +478,6 @@ static int nfs4_sequence_done(struct rpc_task *task, return nfs41_sequence_done(task, res); } -static struct nfs4_slot *nfs4_new_slot(struct nfs4_slot_table *tbl, - u32 slotid, u32 seq_init, gfp_t gfp_mask) -{ - struct nfs4_slot *slot; - - slot = kzalloc(sizeof(*slot), gfp_mask); - if (slot) { - slot->table = tbl; - slot->slot_nr = slotid; - slot->seq_nr = seq_init; - } - return slot; -} - -static struct nfs4_slot *nfs4_find_or_create_slot(struct nfs4_slot_table *tbl, - u32 slotid, u32 seq_init, gfp_t gfp_mask) -{ - struct nfs4_slot **p, *slot; - - p = &tbl->slots; - for (;;) { - if (*p == NULL) { - *p = nfs4_new_slot(tbl, tbl->max_slots, - seq_init, gfp_mask); - if (*p == NULL) - break; - tbl->max_slots++; - } - slot = *p; - if (slot->slot_nr == slotid) - return slot; - p = &slot->next; - } - return ERR_PTR(-ENOMEM); -} - -/* - * nfs4_alloc_slot - efficiently look for a free slot - * - * nfs4_alloc_slot looks for an unset bit in the used_slots bitmap. 
- * If found, we mark the slot as used, update the highest_used_slotid, - * and respectively set up the sequence operation args. - * - * Note: must be called with under the slot_tbl_lock. - */ -static struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl) -{ - struct nfs4_slot *ret = ERR_PTR(-EBUSY); - u32 slotid; - - dprintk("--> %s used_slots=%04lx highest_used=%u max_slots=%u\n", - __func__, tbl->used_slots[0], tbl->highest_used_slotid, - tbl->max_slotid + 1); - slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slotid + 1); - if (slotid > tbl->max_slotid) - goto out; - ret = nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT); - if (IS_ERR(ret)) - goto out; - __set_bit(slotid, tbl->used_slots); - if (slotid > tbl->highest_used_slotid || - tbl->highest_used_slotid == NFS4_NO_SLOT) - tbl->highest_used_slotid = slotid; - ret->renewal_time = jiffies; - ret->generation = tbl->generation; - -out: - dprintk("<-- %s used_slots=%04lx highest_used=%d slotid=%d \n", - __func__, tbl->used_slots[0], tbl->highest_used_slotid, - !IS_ERR(ret) ? ret->slot_nr : -1); - return ret; -} - static void nfs41_init_sequence(struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply) { @@ -5716,143 +5536,6 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) return status; } -static int nfs4_grow_slot_table(struct nfs4_slot_table *tbl, - u32 max_reqs, u32 ivalue) -{ - if (max_reqs <= tbl->max_slots) - return 0; - if (!IS_ERR(nfs4_find_or_create_slot(tbl, max_reqs - 1, ivalue, GFP_NOFS))) - return 0; - return -ENOMEM; -} - -static void nfs4_reset_slot_table(struct nfs4_slot_table *tbl, - u32 server_highest_slotid, - u32 ivalue) -{ - struct nfs4_slot **p; - - nfs4_shrink_slot_table(tbl, server_highest_slotid + 1); - p = &tbl->slots; - while (*p) { - (*p)->seq_nr = ivalue; - p = &(*p)->next; - } - tbl->highest_used_slotid = NFS4_NO_SLOT; - tbl->target_highest_slotid = server_highest_slotid; - tbl->server_highest_slotid = server_highest_slotid; - tbl->max_slotid = server_highest_slotid; -} - -/* - * (re)Initialise a slot table - */ -static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl, - u32 max_reqs, u32 ivalue) -{ - int ret; - - dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__, - max_reqs, tbl->max_slots); - - if (max_reqs > NFS4_MAX_SLOT_TABLE) - max_reqs = NFS4_MAX_SLOT_TABLE; - - ret = nfs4_grow_slot_table(tbl, max_reqs, ivalue); - if (ret) - goto out; - - spin_lock(&tbl->slot_tbl_lock); - nfs4_reset_slot_table(tbl, max_reqs - 1, ivalue); - spin_unlock(&tbl->slot_tbl_lock); - - dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, - tbl, tbl->slots, tbl->max_slots); -out: - dprintk("<-- %s: return %d\n", __func__, ret); - return ret; -} - -/* Destroy the slot table */ -static void nfs4_destroy_slot_tables(struct nfs4_session *session) -{ - nfs4_shrink_slot_table(&session->fc_slot_table, 0); - nfs4_shrink_slot_table(&session->bc_slot_table, 0); -} - -/* - * Initialize or reset the forechannel and backchannel tables - */ -static int nfs4_setup_session_slot_tables(struct nfs4_session *ses) -{ - struct nfs4_slot_table *tbl; - int status; - - dprintk("--> %s\n", __func__); - /* Fore channel */ - tbl = &ses->fc_slot_table; - tbl->session = ses; - status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1); - if (status) /* -ENOMEM */ - return status; - /* Back channel */ - tbl = &ses->bc_slot_table; - tbl->session = ses; - status = nfs4_realloc_slot_table(tbl, ses->bc_attrs.max_reqs, 0); - if (status && tbl->slots == NULL) - 
/* Fore and back channel share a connection so get - * both slot tables or neither */ - nfs4_destroy_slot_tables(ses); - return status; -} - -struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) -{ - struct nfs4_session *session; - struct nfs4_slot_table *tbl; - - session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS); - if (!session) - return NULL; - - tbl = &session->fc_slot_table; - tbl->highest_used_slotid = NFS4_NO_SLOT; - spin_lock_init(&tbl->slot_tbl_lock); - rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table"); - init_completion(&tbl->complete); - - tbl = &session->bc_slot_table; - tbl->highest_used_slotid = NFS4_NO_SLOT; - spin_lock_init(&tbl->slot_tbl_lock); - rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); - init_completion(&tbl->complete); - - session->session_state = 1<<NFS4_SESSION_INITING; - - session->clp = clp; - return session; -} - -void nfs4_destroy_session(struct nfs4_session *session) -{ - struct rpc_xprt *xprt; - struct rpc_cred *cred; - - cred = nfs4_get_exchange_id_cred(session->clp); - nfs4_proc_destroy_session(session, cred); - if (cred) - put_rpccred(cred); - - rcu_read_lock(); - xprt = rcu_dereference(session->clp->cl_rpcclient->cl_xprt); - rcu_read_unlock(); - dprintk("%s Destroy backchannel for xprt %p\n", - __func__, xprt); - xprt_destroy_backchannel(xprt, NFS41_BC_MIN_CALLBACKS); - nfs4_destroy_slot_tables(session); - kfree(session); -} - /* * Initialize the values to be used by the client in CREATE_SESSION * If nfs4_init_session set the fore channel request and response sizes, @@ -6046,100 +5729,6 @@ int nfs4_proc_destroy_session(struct nfs4_session *session, return status; } -/* - * With sessions, the client is not marked ready until after a - * successful EXCHANGE_ID and CREATE_SESSION. - * - * Map errors cl_cons_state errors to EPROTONOSUPPORT to indicate - * other versions of NFS can be tried. 
- */ -static int nfs41_check_session_ready(struct nfs_client *clp) -{ - int ret; - - if (clp->cl_cons_state == NFS_CS_SESSION_INITING) { - ret = nfs4_client_recover_expired_lease(clp); - if (ret) - return ret; - } - if (clp->cl_cons_state < NFS_CS_READY) - return -EPROTONOSUPPORT; - smp_rmb(); - return 0; -} - -int nfs4_init_session(struct nfs_server *server) -{ - struct nfs_client *clp = server->nfs_client; - struct nfs4_session *session; - unsigned int target_max_rqst_sz = NFS_MAX_FILE_IO_SIZE; - unsigned int target_max_resp_sz = NFS_MAX_FILE_IO_SIZE; - - if (!nfs4_has_session(clp)) - return 0; - - if (server->rsize != 0) - target_max_resp_sz = server->rsize; - target_max_resp_sz += nfs41_maxread_overhead; - - if (server->wsize != 0) - target_max_rqst_sz = server->wsize; - target_max_rqst_sz += nfs41_maxwrite_overhead; - - session = clp->cl_session; - spin_lock(&clp->cl_lock); - if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) { - /* Initialise targets and channel attributes */ - session->fc_target_max_rqst_sz = target_max_rqst_sz; - session->fc_attrs.max_rqst_sz = target_max_rqst_sz; - session->fc_target_max_resp_sz = target_max_resp_sz; - session->fc_attrs.max_resp_sz = target_max_resp_sz; - } else { - /* Just adjust the targets */ - if (target_max_rqst_sz > session->fc_target_max_rqst_sz) { - session->fc_target_max_rqst_sz = target_max_rqst_sz; - set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); - } - if (target_max_resp_sz > session->fc_target_max_resp_sz) { - session->fc_target_max_resp_sz = target_max_resp_sz; - set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); - } - } - spin_unlock(&clp->cl_lock); - - if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) - nfs4_schedule_lease_recovery(clp); - - return nfs41_check_session_ready(clp); -} - -int nfs4_init_ds_session(struct nfs_client *clp, unsigned long lease_time) -{ - struct nfs4_session *session = clp->cl_session; - int ret; - - spin_lock(&clp->cl_lock); - if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) { - /* - * Do not set NFS_CS_CHECK_LEASE_TIME instead set the - * DS lease to be equal to the MDS lease. - */ - clp->cl_lease_time = lease_time; - clp->cl_last_renewal = jiffies; - } - spin_unlock(&clp->cl_lock); - - ret = nfs41_check_session_ready(clp); - if (ret) - return ret; - /* Test for the DS role */ - if (!is_ds_client(clp)) - return -ENODEV; - return 0; -} -EXPORT_SYMBOL_GPL(nfs4_init_ds_session); - - /* * Renew the cl_session lease. */ diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c new file mode 100644 index 000000000000..701170293ceb --- /dev/null +++ b/fs/nfs/nfs4session.c @@ -0,0 +1,436 @@ +/* + * fs/nfs/nfs4session.c + * + * Copyright (c) 2012 Trond Myklebust + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nfs4_fs.h" +#include "internal.h" +#include "nfs4session.h" +#include "callback.h" + +#define NFSDBG_FACILITY NFSDBG_STATE + +/* + * nfs4_shrink_slot_table - free retired slots from the slot table + */ +static void nfs4_shrink_slot_table(struct nfs4_slot_table *tbl, u32 newsize) +{ + struct nfs4_slot **p; + if (newsize >= tbl->max_slots) + return; + + p = &tbl->slots; + while (newsize--) + p = &(*p)->next; + while (*p) { + struct nfs4_slot *slot = *p; + + *p = slot->next; + kfree(slot); + tbl->max_slots--; + } +} + +/* + * nfs4_free_slot - free a slot and efficiently update slot table. + * + * freeing a slot is trivially done by clearing its respective bit + * in the bitmap. 
+ * If the freed slotid equals highest_used_slotid we want to update it + * so that the server would be able to size down the slot table if needed, + * otherwise we know that the highest_used_slotid is still in use. + * When updating highest_used_slotid there may be "holes" in the bitmap + * so we need to scan down from highest_used_slotid to 0 looking for the now + * highest slotid in use. + * If none found, highest_used_slotid is set to NFS4_NO_SLOT. + * + * Must be called while holding tbl->slot_tbl_lock + */ +void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot) +{ + u32 slotid = slot->slot_nr; + + /* clear used bit in bitmap */ + __clear_bit(slotid, tbl->used_slots); + + /* update highest_used_slotid when it is freed */ + if (slotid == tbl->highest_used_slotid) { + u32 new_max = find_last_bit(tbl->used_slots, slotid); + if (new_max < slotid) + tbl->highest_used_slotid = new_max; + else { + tbl->highest_used_slotid = NFS4_NO_SLOT; + nfs4_session_drain_complete(tbl->session, tbl); + } + } + dprintk("%s: slotid %u highest_used_slotid %d\n", __func__, + slotid, tbl->highest_used_slotid); +} + +static struct nfs4_slot *nfs4_new_slot(struct nfs4_slot_table *tbl, + u32 slotid, u32 seq_init, gfp_t gfp_mask) +{ + struct nfs4_slot *slot; + + slot = kzalloc(sizeof(*slot), gfp_mask); + if (slot) { + slot->table = tbl; + slot->slot_nr = slotid; + slot->seq_nr = seq_init; + } + return slot; +} + +static struct nfs4_slot *nfs4_find_or_create_slot(struct nfs4_slot_table *tbl, + u32 slotid, u32 seq_init, gfp_t gfp_mask) +{ + struct nfs4_slot **p, *slot; + + p = &tbl->slots; + for (;;) { + if (*p == NULL) { + *p = nfs4_new_slot(tbl, tbl->max_slots, + seq_init, gfp_mask); + if (*p == NULL) + break; + tbl->max_slots++; + } + slot = *p; + if (slot->slot_nr == slotid) + return slot; + p = &slot->next; + } + return ERR_PTR(-ENOMEM); +} + +/* + * nfs4_alloc_slot - efficiently look for a free slot + * + * nfs4_alloc_slot looks for an unset bit in the used_slots bitmap. + * If found, we mark the slot as used, update the highest_used_slotid, + * and respectively set up the sequence operation args. + * + * Note: must be called with under the slot_tbl_lock. + */ +struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl) +{ + struct nfs4_slot *ret = ERR_PTR(-EBUSY); + u32 slotid; + + dprintk("--> %s used_slots=%04lx highest_used=%u max_slots=%u\n", + __func__, tbl->used_slots[0], tbl->highest_used_slotid, + tbl->max_slotid + 1); + slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slotid + 1); + if (slotid > tbl->max_slotid) + goto out; + ret = nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT); + if (IS_ERR(ret)) + goto out; + __set_bit(slotid, tbl->used_slots); + if (slotid > tbl->highest_used_slotid || + tbl->highest_used_slotid == NFS4_NO_SLOT) + tbl->highest_used_slotid = slotid; + ret->renewal_time = jiffies; + ret->generation = tbl->generation; + +out: + dprintk("<-- %s used_slots=%04lx highest_used=%d slotid=%d \n", + __func__, tbl->used_slots[0], tbl->highest_used_slotid, + !IS_ERR(ret) ? 
ret->slot_nr : -1); + return ret; +} + +static int nfs4_grow_slot_table(struct nfs4_slot_table *tbl, + u32 max_reqs, u32 ivalue) +{ + if (max_reqs <= tbl->max_slots) + return 0; + if (!IS_ERR(nfs4_find_or_create_slot(tbl, max_reqs - 1, ivalue, GFP_NOFS))) + return 0; + return -ENOMEM; +} + +static void nfs4_reset_slot_table(struct nfs4_slot_table *tbl, + u32 server_highest_slotid, + u32 ivalue) +{ + struct nfs4_slot **p; + + nfs4_shrink_slot_table(tbl, server_highest_slotid + 1); + p = &tbl->slots; + while (*p) { + (*p)->seq_nr = ivalue; + p = &(*p)->next; + } + tbl->highest_used_slotid = NFS4_NO_SLOT; + tbl->target_highest_slotid = server_highest_slotid; + tbl->server_highest_slotid = server_highest_slotid; + tbl->max_slotid = server_highest_slotid; +} + +/* + * (re)Initialise a slot table + */ +static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl, + u32 max_reqs, u32 ivalue) +{ + int ret; + + dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__, + max_reqs, tbl->max_slots); + + if (max_reqs > NFS4_MAX_SLOT_TABLE) + max_reqs = NFS4_MAX_SLOT_TABLE; + + ret = nfs4_grow_slot_table(tbl, max_reqs, ivalue); + if (ret) + goto out; + + spin_lock(&tbl->slot_tbl_lock); + nfs4_reset_slot_table(tbl, max_reqs - 1, ivalue); + spin_unlock(&tbl->slot_tbl_lock); + + dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, + tbl, tbl->slots, tbl->max_slots); +out: + dprintk("<-- %s: return %d\n", __func__, ret); + return ret; +} + +/* Destroy the slot table */ +static void nfs4_destroy_slot_tables(struct nfs4_session *session) +{ + nfs4_shrink_slot_table(&session->fc_slot_table, 0); + nfs4_shrink_slot_table(&session->bc_slot_table, 0); +} + +/* Update the client's idea of target_highest_slotid */ +static void nfs41_set_target_slotid_locked(struct nfs4_slot_table *tbl, + u32 target_highest_slotid) +{ + unsigned int max_slotid, i; + + if (tbl->target_highest_slotid == target_highest_slotid) + return; + tbl->target_highest_slotid = target_highest_slotid; + tbl->generation++; + + max_slotid = min(NFS4_MAX_SLOT_TABLE - 1, tbl->target_highest_slotid); + for (i = tbl->max_slotid + 1; i <= max_slotid; i++) + rpc_wake_up_next(&tbl->slot_tbl_waitq); + tbl->max_slotid = max_slotid; +} + +void nfs41_set_target_slotid(struct nfs4_slot_table *tbl, + u32 target_highest_slotid) +{ + spin_lock(&tbl->slot_tbl_lock); + nfs41_set_target_slotid_locked(tbl, target_highest_slotid); + spin_unlock(&tbl->slot_tbl_lock); +} + +static void nfs41_set_server_slotid_locked(struct nfs4_slot_table *tbl, + u32 highest_slotid) +{ + if (tbl->server_highest_slotid == highest_slotid) + return; + if (tbl->highest_used_slotid > highest_slotid) + return; + /* Deallocate slots */ + nfs4_shrink_slot_table(tbl, highest_slotid + 1); + tbl->server_highest_slotid = highest_slotid; +} + +void nfs41_update_target_slotid(struct nfs4_slot_table *tbl, + struct nfs4_slot *slot, + struct nfs4_sequence_res *res) +{ + spin_lock(&tbl->slot_tbl_lock); + if (tbl->generation != slot->generation) + goto out; + nfs41_set_server_slotid_locked(tbl, res->sr_highest_slotid); + nfs41_set_target_slotid_locked(tbl, res->sr_target_highest_slotid); +out: + spin_unlock(&tbl->slot_tbl_lock); +} + +/* + * Initialize or reset the forechannel and backchannel tables + */ +int nfs4_setup_session_slot_tables(struct nfs4_session *ses) +{ + struct nfs4_slot_table *tbl; + int status; + + dprintk("--> %s\n", __func__); + /* Fore channel */ + tbl = &ses->fc_slot_table; + tbl->session = ses; + status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1); + if (status) 
/* -ENOMEM */ + return status; + /* Back channel */ + tbl = &ses->bc_slot_table; + tbl->session = ses; + status = nfs4_realloc_slot_table(tbl, ses->bc_attrs.max_reqs, 0); + if (status && tbl->slots == NULL) + /* Fore and back channel share a connection so get + * both slot tables or neither */ + nfs4_destroy_slot_tables(ses); + return status; +} + +struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) +{ + struct nfs4_session *session; + struct nfs4_slot_table *tbl; + + session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS); + if (!session) + return NULL; + + tbl = &session->fc_slot_table; + tbl->highest_used_slotid = NFS4_NO_SLOT; + spin_lock_init(&tbl->slot_tbl_lock); + rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table"); + init_completion(&tbl->complete); + + tbl = &session->bc_slot_table; + tbl->highest_used_slotid = NFS4_NO_SLOT; + spin_lock_init(&tbl->slot_tbl_lock); + rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); + init_completion(&tbl->complete); + + session->session_state = 1<<NFS4_SESSION_INITING; + + session->clp = clp; + return session; +} + +void nfs4_destroy_session(struct nfs4_session *session) +{ + struct rpc_xprt *xprt; + struct rpc_cred *cred; + + cred = nfs4_get_exchange_id_cred(session->clp); + nfs4_proc_destroy_session(session, cred); + if (cred) + put_rpccred(cred); + + rcu_read_lock(); + xprt = rcu_dereference(session->clp->cl_rpcclient->cl_xprt); + rcu_read_unlock(); + dprintk("%s Destroy backchannel for xprt %p\n", + __func__, xprt); + xprt_destroy_backchannel(xprt, NFS41_BC_MIN_CALLBACKS); + nfs4_destroy_slot_tables(session); + kfree(session); +} + +/* + * With sessions, the client is not marked ready until after a + * successful EXCHANGE_ID and CREATE_SESSION. + * + * Map errors cl_cons_state errors to EPROTONOSUPPORT to indicate + * other versions of NFS can be tried. 
+ */ +static int nfs41_check_session_ready(struct nfs_client *clp) +{ + int ret; + + if (clp->cl_cons_state == NFS_CS_SESSION_INITING) { + ret = nfs4_client_recover_expired_lease(clp); + if (ret) + return ret; + } + if (clp->cl_cons_state < NFS_CS_READY) + return -EPROTONOSUPPORT; + smp_rmb(); + return 0; +} + +int nfs4_init_session(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_session *session; + unsigned int target_max_rqst_sz = NFS_MAX_FILE_IO_SIZE; + unsigned int target_max_resp_sz = NFS_MAX_FILE_IO_SIZE; + + if (!nfs4_has_session(clp)) + return 0; + + if (server->rsize != 0) + target_max_resp_sz = server->rsize; + target_max_resp_sz += nfs41_maxread_overhead; + + if (server->wsize != 0) + target_max_rqst_sz = server->wsize; + target_max_rqst_sz += nfs41_maxwrite_overhead; + + session = clp->cl_session; + spin_lock(&clp->cl_lock); + if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) { + /* Initialise targets and channel attributes */ + session->fc_target_max_rqst_sz = target_max_rqst_sz; + session->fc_attrs.max_rqst_sz = target_max_rqst_sz; + session->fc_target_max_resp_sz = target_max_resp_sz; + session->fc_attrs.max_resp_sz = target_max_resp_sz; + } else { + /* Just adjust the targets */ + if (target_max_rqst_sz > session->fc_target_max_rqst_sz) { + session->fc_target_max_rqst_sz = target_max_rqst_sz; + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + } + if (target_max_resp_sz > session->fc_target_max_resp_sz) { + session->fc_target_max_resp_sz = target_max_resp_sz; + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + } + } + spin_unlock(&clp->cl_lock); + + if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) + nfs4_schedule_lease_recovery(clp); + + return nfs41_check_session_ready(clp); +} + +int nfs4_init_ds_session(struct nfs_client *clp, unsigned long lease_time) +{ + struct nfs4_session *session = clp->cl_session; + int ret; + + spin_lock(&clp->cl_lock); + if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) { + /* + * Do not set NFS_CS_CHECK_LEASE_TIME instead set the + * DS lease to be equal to the MDS lease. 
+ */ + clp->cl_lease_time = lease_time; + clp->cl_last_renewal = jiffies; + } + spin_unlock(&clp->cl_lock); + + ret = nfs41_check_session_ready(clp); + if (ret) + return ret; + /* Test for the DS role */ + if (!is_ds_client(clp)) + return -ENODEV; + return 0; +} +EXPORT_SYMBOL_GPL(nfs4_init_ds_session); + + diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h new file mode 100644 index 000000000000..cb47b1eb0886 --- /dev/null +++ b/fs/nfs/nfs4session.h @@ -0,0 +1,35 @@ +/* + * fs/nfs/nfs4session.h + * + * Copyright (c) 2012 Trond Myklebust + * + */ +#ifndef __LINUX_FS_NFS_NFS4SESSION_H +#define __LINUX_FS_NFS_NFS4SESSION_H + +#if defined(CONFIG_NFS_V4_1) +extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl); +extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot); + +extern void nfs41_set_target_slotid(struct nfs4_slot_table *tbl, + u32 target_highest_slotid); +extern void nfs41_update_target_slotid(struct nfs4_slot_table *tbl, + struct nfs4_slot *slot, + struct nfs4_sequence_res *res); + +extern int nfs4_setup_session_slot_tables(struct nfs4_session *ses); + +extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); +extern void nfs4_destroy_session(struct nfs4_session *session); +extern int nfs4_init_session(struct nfs_server *server); +extern int nfs4_init_ds_session(struct nfs_client *, unsigned long); + +#else /* defined(CONFIG_NFS_V4_1) */ + +static inline int nfs4_init_session(struct nfs_server *server) +{ + return 0; +} + +#endif /* defined(CONFIG_NFS_V4_1) */ +#endif /* __LINUX_FS_NFS_NFS4SESSION_H */ -- cgit v1.2.1 From 76e697ba7e8d187f50e385d21a2b2f1709a62c14 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 26 Nov 2012 14:20:49 -0500 Subject: NFSv4.1: Move slot table and session struct definitions to nfs4session.h Clean up. Gather NFSv4.1 slot definitions in fs/nfs/nfs4session.h. Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 1 + fs/nfs/callback_xdr.c | 1 + fs/nfs/internal.h | 21 -------- fs/nfs/nfs4_fs.h | 12 ----- fs/nfs/nfs4filelayout.c | 1 + fs/nfs/nfs4session.h | 101 +++++++++++++++++++++++++++++++++++++++ fs/nfs/nfs4state.c | 1 + fs/nfs/nfs4xdr.c | 1 + fs/nfs/super.c | 1 + 9 files changed, 107 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index f1027b06a1a9..4fa788c93f46 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -40,6 +40,7 @@ #include #include "../pnfs.h" +#include "../nfs4session.h" #include "../internal.h" #include "blocklayout.h" diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index ea6a7b190e6b..59461c957d9d 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -16,6 +16,7 @@ #include "nfs4_fs.h" #include "callback.h" #include "internal.h" +#include "nfs4session.h" #define CB_OP_TAGLEN_MAXSZ (512) #define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 8965a998b306..9bdbfc3884a9 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -18,27 +18,6 @@ struct nfs_string; */ #define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1) -/* - * Determine if sessions are in use. 
- */ -static inline int nfs4_has_session(const struct nfs_client *clp) -{ -#ifdef CONFIG_NFS_V4_1 - if (clp->cl_session) - return 1; -#endif /* CONFIG_NFS_V4_1 */ - return 0; -} - -static inline int nfs4_has_persistent_session(const struct nfs_client *clp) -{ -#ifdef CONFIG_NFS_V4_1 - if (nfs4_has_session(clp)) - return (clp->cl_session->flags & SESSION4_PERSIST); -#endif /* CONFIG_NFS_V4_1 */ - return 0; -} - static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr) { if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid)) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index cd3e3096b60a..322bd0168ebf 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -29,11 +29,6 @@ enum nfs4_client_state { NFS4CLNT_BIND_CONN_TO_SESSION, }; -enum nfs4_session_state { - NFS4_SESSION_INITING, - NFS4_SESSION_DRAINING, -}; - #define NFS4_RENEW_TIMEOUT 0x01 #define NFS4_RENEW_DELEGATION_CB 0x02 @@ -327,13 +322,6 @@ int nfs41_discover_server_trunking(struct nfs_client *clp, extern void nfs4_schedule_session_recovery(struct nfs4_session *, int); extern void nfs41_server_notify_target_slotid_update(struct nfs_client *clp); -extern void nfs4_session_drain_complete(struct nfs4_session *session, - struct nfs4_slot_table *tbl); - -static inline bool nfs4_session_draining(struct nfs4_session *session) -{ - return !!test_bit(NFS4_SESSION_DRAINING, &session->session_state); -} #else static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) { diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index bfb28fa38e74..591a1a7f8f94 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -35,6 +35,7 @@ #include +#include "nfs4session.h" #include "internal.h" #include "delegation.h" #include "nfs4filelayout.h" diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index cb47b1eb0886..e96323ff1d95 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -7,6 +7,68 @@ #ifndef __LINUX_FS_NFS_NFS4SESSION_H #define __LINUX_FS_NFS_NFS4SESSION_H +/* maximum number of slots to use */ +#define NFS4_DEF_SLOT_TABLE_SIZE (16U) +#define NFS4_MAX_SLOT_TABLE (256U) +#define NFS4_NO_SLOT ((u32)-1) + +#if IS_ENABLED(CONFIG_NFS_V4) + +/* Sessions slot seqid */ +struct nfs4_slot { + struct nfs4_slot_table *table; + struct nfs4_slot *next; + unsigned long generation; + unsigned long renewal_time; + u32 slot_nr; + u32 seq_nr; +}; + +/* Sessions */ +#define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, 8*sizeof(long)) +struct nfs4_slot_table { + struct nfs4_session *session; /* Parent session */ + struct nfs4_slot *slots; /* seqid per slot */ + unsigned long used_slots[SLOT_TABLE_SZ]; /* used/unused bitmap */ + spinlock_t slot_tbl_lock; + struct rpc_wait_queue slot_tbl_waitq; /* allocators may wait here */ + u32 max_slots; /* # slots in table */ + u32 max_slotid; /* Max allowed slotid value */ + u32 highest_used_slotid; /* sent to server on each SEQ. 
+ * op for dynamic resizing */ + u32 target_highest_slotid; /* Server max_slot target */ + u32 server_highest_slotid; /* Server highest slotid */ + unsigned long generation; /* Generation counter for + target_highest_slotid */ + struct completion complete; +}; + +/* + * Session related parameters + */ +struct nfs4_session { + struct nfs4_sessionid sess_id; + u32 flags; + unsigned long session_state; + u32 hash_alg; + u32 ssv_len; + + /* The fore and back channel */ + struct nfs4_channel_attrs fc_attrs; + struct nfs4_slot_table fc_slot_table; + struct nfs4_channel_attrs bc_attrs; + struct nfs4_slot_table bc_slot_table; + struct nfs_client *clp; + /* Create session arguments */ + unsigned int fc_target_max_rqst_sz; + unsigned int fc_target_max_resp_sz; +}; + +enum nfs4_session_state { + NFS4_SESSION_INITING, + NFS4_SESSION_DRAINING, +}; + #if defined(CONFIG_NFS_V4_1) extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl); extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot); @@ -24,6 +86,31 @@ extern void nfs4_destroy_session(struct nfs4_session *session); extern int nfs4_init_session(struct nfs_server *server); extern int nfs4_init_ds_session(struct nfs_client *, unsigned long); +extern void nfs4_session_drain_complete(struct nfs4_session *session, + struct nfs4_slot_table *tbl); + +static inline bool nfs4_session_draining(struct nfs4_session *session) +{ + return !!test_bit(NFS4_SESSION_DRAINING, &session->session_state); +} + +/* + * Determine if sessions are in use. + */ +static inline int nfs4_has_session(const struct nfs_client *clp) +{ + if (clp->cl_session) + return 1; + return 0; +} + +static inline int nfs4_has_persistent_session(const struct nfs_client *clp) +{ + if (nfs4_has_session(clp)) + return (clp->cl_session->flags & SESSION4_PERSIST); + return 0; +} + #else /* defined(CONFIG_NFS_V4_1) */ static inline int nfs4_init_session(struct nfs_server *server) @@ -31,5 +118,19 @@ static inline int nfs4_init_session(struct nfs_server *server) return 0; } +/* + * Determine if sessions are in use. + */ +static inline int nfs4_has_session(const struct nfs_client *clp) +{ + return 0; +} + +static inline int nfs4_has_persistent_session(const struct nfs_client *clp) +{ + return 0; +} + #endif /* defined(CONFIG_NFS_V4_1) */ +#endif /* IS_ENABLED(CONFIG_NFS_V4) */ #endif /* __LINUX_FS_NFS_NFS4SESSION_H */ diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 1077b9698381..1402283d152d 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -57,6 +57,7 @@ #include "callback.h" #include "delegation.h" #include "internal.h" +#include "nfs4session.h" #include "pnfs.h" #include "netns.h" diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index a67040f51597..e786dc7582b1 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -56,6 +56,7 @@ #include "nfs4_fs.h" #include "internal.h" +#include "nfs4session.h" #include "pnfs.h" #include "netns.h" diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 652d3f7176a9..e12cea4b36a5 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -64,6 +64,7 @@ #include "iostat.h" #include "internal.h" #include "fscache.h" +#include "nfs4session.h" #include "pnfs.h" #include "nfs.h" -- cgit v1.2.1 From 0ca3f4825ac92a10aa8f6534f765c44f22778dd3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 21 Nov 2012 22:34:45 -0500 Subject: NFSv4.1: Set the maximum slot table size to 1024 slots This means that we end up statically allocating 128 bytes for the bitmap on each slot table. 
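Both that bitmap figure and the I/O window mentioned next are quick to check. A minimal sketch, assuming only the whole-long round-up that SLOT_TABLE_SZ encodes and one 1MB request in flight per slot:

#include <stdio.h>

#define NFS4_MAX_SLOT_TABLE 1024U
#define BITS_PER_LONG (8 * sizeof(unsigned long))

int main(void)
{
	/* bitmap: one bit per slot, rounded up to whole longs */
	unsigned long words = (NFS4_MAX_SLOT_TABLE + BITS_PER_LONG - 1) / BITS_PER_LONG;

	printf("used_slots bitmap: %lu bytes\n",
	       (unsigned long)(words * sizeof(unsigned long)));
	/* 1024 bits = 128 bytes, on 32- and 64-bit longs alike */

	/* every slot carrying a 1MB READ or WRITE at once */
	printf("max in-flight I/O: %llu MB\n",
	       (NFS4_MAX_SLOT_TABLE * (1ULL << 20)) >> 20);
	/* 1024 MB -- enough to keep a 1GB TCP window full */
	return 0;
}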
For a server that supports 1MB write and read I/O sizes this means that we can completely fill the maximum 1GB TCP send/receive windows. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4session.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index e96323ff1d95..bdd14a60722b 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -9,7 +9,7 @@ /* maximum number of slots to use */ #define NFS4_DEF_SLOT_TABLE_SIZE (16U) -#define NFS4_MAX_SLOT_TABLE (256U) +#define NFS4_MAX_SLOT_TABLE (1024U) #define NFS4_NO_SLOT ((u32)-1) #if IS_ENABLED(CONFIG_NFS_V4) -- cgit v1.2.1 From c10e449827e6008ef5a4a71c0247c7eb73948e1b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 26 Nov 2012 16:16:54 -0500 Subject: NFSv4.1: Ping server when our session table limits are too high If the server requests a lower target_highest_slotid, then ensure that we ping it with at least one RPC call containing an appropriate SEQUENCE op. This ensures that the server won't need to send a recall callback in order to shrink the slot table. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 1 + fs/nfs/nfs4proc.c | 20 +++++++++++++++++--- fs/nfs/nfs4state.c | 5 +++++ 3 files changed, 23 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 322bd0168ebf..8fe155ba16d1 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -321,6 +321,7 @@ int nfs41_discover_server_trunking(struct nfs_client *clp, struct nfs_client **, struct rpc_cred *); extern void nfs4_schedule_session_recovery(struct nfs4_session *, int); extern void nfs41_server_notify_target_slotid_update(struct nfs_client *clp); +extern void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp); #else static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a0c35ab12a6b..ecd4ed3a4f65 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -389,6 +389,7 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) { struct nfs4_session *session; struct nfs4_slot_table *tbl; + bool send_new_highest_used_slotid = false; if (!res->sr_slot) { /* just wake up the next guy waiting since @@ -400,12 +401,25 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) session = tbl->session; spin_lock(&tbl->slot_tbl_lock); + /* Be nice to the server: try to ensure that the last transmitted + * value for highest_user_slotid <= target_highest_slotid + */ + if (tbl->highest_used_slotid > tbl->target_highest_slotid) + send_new_highest_used_slotid = true; + nfs4_free_slot(tbl, res->sr_slot); - if (!nfs4_session_draining(session)) - rpc_wake_up_first(&tbl->slot_tbl_waitq, - nfs4_set_task_privileged, NULL); + + if (tbl->highest_used_slotid != NFS4_NO_SLOT) + send_new_highest_used_slotid = false; + if (!nfs4_session_draining(session)) { + if (rpc_wake_up_first(&tbl->slot_tbl_waitq, + nfs4_set_task_privileged, NULL) != NULL) + send_new_highest_used_slotid = false; + } spin_unlock(&tbl->slot_tbl_lock); res->sr_slot = NULL; + if (send_new_highest_used_slotid) + nfs41_server_notify_highest_slotid_update(session->clp); } static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 1402283d152d..c137421f2123 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1961,6 +1961,11 @@ void nfs41_server_notify_target_slotid_update(struct nfs_client *clp) 
nfs41_ping_server(clp); } +void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp) +{ + nfs41_ping_server(clp); +} + static void nfs4_reset_all_state(struct nfs_client *clp) { if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { -- cgit v1.2.1 From 6ba7db3420c0dbf3ede16f19a593e6a80edc043f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 22 Oct 2012 20:07:20 -0400 Subject: NFSv4.1: Use nfs41_setup_sequence where appropriate There is no point in using nfs4_setup_sequence or nfs4_sequence_done in pure NFSv4.1 functions. We already know that those have sessions... Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ecd4ed3a4f65..39d24158f97f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -597,10 +597,11 @@ struct nfs41_call_sync_data { static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata) { struct nfs41_call_sync_data *data = calldata; + struct nfs4_session *session = nfs4_get_session(data->seq_server); dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server); - if (nfs4_setup_sequence(data->seq_server, data->seq_args, + if (nfs41_setup_sequence(session, data->seq_args, data->seq_res, task)) return; rpc_call_start(task); @@ -6018,6 +6019,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) { struct nfs4_layoutget *lgp = calldata; struct nfs_server *server = NFS_SERVER(lgp->args.inode); + struct nfs4_session *session = nfs4_get_session(server); dprintk("--> %s\n", __func__); /* Note the is a race here, where a CB_LAYOUTRECALL can come in @@ -6025,7 +6027,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) * However, that is not so catastrophic, and there seems * to be no way to prevent it completely. 
*/ - if (nfs4_setup_sequence(server, &lgp->args.seq_args, + if (nfs41_setup_sequence(session, &lgp->args.seq_args, &lgp->res.seq_res, task)) return; if (pnfs_choose_layoutget_stateid(&lgp->args.stateid, @@ -6047,7 +6049,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) dprintk("--> %s\n", __func__); - if (!nfs4_sequence_done(task, &lgp->res.seq_res)) + if (!nfs41_sequence_done(task, &lgp->res.seq_res)) goto out; switch (task->tk_status) { @@ -6211,7 +6213,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) dprintk("--> %s\n", __func__); - if (!nfs4_sequence_done(task, &lrp->res.seq_res)) + if (!nfs41_sequence_done(task, &lrp->res.seq_res)) return; server = NFS_SERVER(lrp->args.inode); @@ -6360,8 +6362,9 @@ static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata) { struct nfs4_layoutcommit_data *data = calldata; struct nfs_server *server = NFS_SERVER(data->args.inode); + struct nfs4_session *session = nfs4_get_session(server); - if (nfs4_setup_sequence(server, &data->args.seq_args, + if (nfs41_setup_sequence(session, &data->args.seq_args, &data->res.seq_res, task)) return; rpc_call_start(task); @@ -6373,7 +6376,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) struct nfs4_layoutcommit_data *data = calldata; struct nfs_server *server = NFS_SERVER(data->args.inode); - if (!nfs4_sequence_done(task, &data->res.seq_res)) + if (!nfs41_sequence_done(task, &data->res.seq_res)) return; switch (task->tk_status) { /* Just ignore these failures */ -- cgit v1.2.1 From d9afbd1b0889e7da6742e9c67ccc7becc4161f65 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 22 Oct 2012 20:28:44 -0400 Subject: NFSv4.1: Simplify the sequence setup Nobody calls nfs4_setup_sequence or nfs41_setup_sequence without also calling rpc_call_start() on success. This commit therefore folds the rpc_call_start call into nfs41_setup_sequence(). 
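Reduced to a toy, the shape of the change is this (stub types and names, not the real sunrpc API): the success-path start moves into the helper, so each call site in the diff below collapses to a single statement.

#include <stdio.h>

struct rpc_task { int started; };

static void rpc_call_start_stub(struct rpc_task *task)
{
	task->started = 1;
}

/* old helper: returns 0 on success, nonzero if the task must wait */
static int setup_sequence_old(struct rpc_task *task)
{
	return 0;			/* pretend a slot was free */
}

/* new helper: starts the task itself on the success path */
static int setup_sequence_new(struct rpc_task *task)
{
	rpc_call_start_stub(task);
	return 0;
}

static void prepare_old(struct rpc_task *task)
{
	if (setup_sequence_old(task))
		return;			/* queued: do not start */
	rpc_call_start_stub(task);	/* boilerplate at every call site */
}

static void prepare_new(struct rpc_task *task)
{
	setup_sequence_new(task);	/* one line, impossible to forget */
}

int main(void)
{
	struct rpc_task a = { 0 }, b = { 0 };

	prepare_old(&a);
	prepare_new(&b);
	printf("old started=%d, new started=%d\n", a.started, b.started);
	return 0;
}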
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 1 + fs/nfs/nfs4filelayout.c | 30 +++++------- fs/nfs/nfs4proc.c | 125 +++++++++++++++++++----------------------------- 3 files changed, 62 insertions(+), 94 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 8fe155ba16d1..8022adec34cd 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -273,6 +273,7 @@ static inline int nfs4_setup_sequence(const struct nfs_server *server, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, struct rpc_task *task) { + rpc_call_start(task); return 0; } diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 591a1a7f8f94..1e42413fab8f 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -307,12 +307,10 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data) } rdata->read_done_cb = filelayout_read_done_cb; - if (nfs41_setup_sequence(rdata->ds_clp->cl_session, - &rdata->args.seq_args, &rdata->res.seq_res, - task)) - return; - - rpc_call_start(task); + nfs41_setup_sequence(rdata->ds_clp->cl_session, + &rdata->args.seq_args, + &rdata->res.seq_res, + task); } static void filelayout_read_call_done(struct rpc_task *task, void *data) @@ -409,12 +407,10 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data) rpc_exit(task, 0); return; } - if (nfs41_setup_sequence(wdata->ds_clp->cl_session, - &wdata->args.seq_args, &wdata->res.seq_res, - task)) - return; - - rpc_call_start(task); + nfs41_setup_sequence(wdata->ds_clp->cl_session, + &wdata->args.seq_args, + &wdata->res.seq_res, + task); } static void filelayout_write_call_done(struct rpc_task *task, void *data) @@ -450,12 +446,10 @@ static void filelayout_commit_prepare(struct rpc_task *task, void *data) { struct nfs_commit_data *wdata = data; - if (nfs41_setup_sequence(wdata->ds_clp->cl_session, - &wdata->args.seq_args, &wdata->res.seq_res, - task)) - return; - - rpc_call_start(task); + nfs41_setup_sequence(wdata->ds_clp->cl_session, + &wdata->args.seq_args, + &wdata->res.seq_res, + task); } static void filelayout_write_commit_done(struct rpc_task *task, void *data) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 39d24158f97f..23b0c2fcb052 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -513,7 +513,7 @@ int nfs41_setup_sequence(struct nfs4_session *session, dprintk("--> %s\n", __func__); /* slot already allocated? */ if (res->sr_slot != NULL) - return 0; + goto out_success; tbl = &session->fc_slot_table; @@ -563,6 +563,8 @@ int nfs41_setup_sequence(struct nfs4_session *session, * set to 1 if an rpc level failure occurs. */ res->sr_status = 1; +out_success: + rpc_call_start(task); return 0; } EXPORT_SYMBOL_GPL(nfs41_setup_sequence); @@ -575,8 +577,10 @@ int nfs4_setup_sequence(const struct nfs_server *server, struct nfs4_session *session = nfs4_get_session(server); int ret = 0; - if (session == NULL) + if (session == NULL) { + rpc_call_start(task); goto out; + } dprintk("--> %s clp %p session %p sr_slot %d\n", __func__, session->clp, session, res->sr_slot ? 
@@ -601,10 +605,7 @@ static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata) dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server); - if (nfs41_setup_sequence(session, data->seq_args, - data->seq_res, task)) - return; - rpc_call_start(task); + nfs41_setup_sequence(session, data->seq_args, data->seq_res, task); } static void nfs41_call_priv_sync_prepare(struct rpc_task *task, void *calldata) @@ -1485,8 +1486,6 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) &data->o_res.seq_res, task) != 0) nfs_release_seqid(data->o_arg.seqid); - else - rpc_call_start(task); return; unlock_no_action: rcu_read_unlock(); @@ -2192,8 +2191,6 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) &calldata->res.seq_res, task) != 0) nfs_release_seqid(calldata->arg.seqid); - else - rpc_call_start(task); out: dprintk("%s: done!\n", __func__); } @@ -2932,12 +2929,10 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) { - if (nfs4_setup_sequence(NFS_SERVER(data->dir), - &data->args.seq_args, - &data->res.seq_res, - task)) - return; - rpc_call_start(task); + nfs4_setup_sequence(NFS_SERVER(data->dir), + &data->args.seq_args, + &data->res.seq_res, + task); } static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) @@ -2965,12 +2960,10 @@ static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir) static void nfs4_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data) { - if (nfs4_setup_sequence(NFS_SERVER(data->old_dir), - &data->args.seq_args, - &data->res.seq_res, - task)) - return; - rpc_call_start(task); + nfs4_setup_sequence(NFS_SERVER(data->old_dir), + &data->args.seq_args, + &data->res.seq_res, + task); } static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, @@ -3459,12 +3452,10 @@ static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) { - if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), - &data->args.seq_args, - &data->res.seq_res, - task)) - return; - rpc_call_start(task); + nfs4_setup_sequence(NFS_SERVER(data->header->inode), + &data->args.seq_args, + &data->res.seq_res, + task); } static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data) @@ -3525,22 +3516,18 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) { - if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), - &data->args.seq_args, - &data->res.seq_res, - task)) - return; - rpc_call_start(task); + nfs4_setup_sequence(NFS_SERVER(data->header->inode), + &data->args.seq_args, + &data->res.seq_res, + task); } static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) { - if (nfs4_setup_sequence(NFS_SERVER(data->inode), - &data->args.seq_args, - &data->res.seq_res, - task)) - return; - rpc_call_start(task); + nfs4_setup_sequence(NFS_SERVER(data->inode), + &data->args.seq_args, + &data->res.seq_res, + task); } static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *data) @@ -4187,11 +4174,10 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) d_data = (struct nfs4_delegreturndata *)data; - if (nfs4_setup_sequence(d_data->res.server, - 
&d_data->args.seq_args, - &d_data->res.seq_res, task)) - return; - rpc_call_start(task); + nfs4_setup_sequence(d_data->res.server, + &d_data->args.seq_args, + &d_data->res.seq_res, + task); } #endif /* CONFIG_NFS_V4_1 */ @@ -4445,8 +4431,6 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) &calldata->res.seq_res, task) != 0) nfs_release_seqid(calldata->arg.seqid); - else - rpc_call_start(task); } static const struct rpc_call_ops nfs4_locku_ops = { @@ -4601,10 +4585,8 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) if (nfs4_setup_sequence(data->server, &data->arg.seq_args, &data->res.seq_res, - task) == 0) { - rpc_call_start(task); + task) == 0) return; - } nfs_release_seqid(data->arg.open_seqid); out_release_lock_seqid: nfs_release_seqid(data->arg.lock_seqid); @@ -5462,7 +5444,6 @@ struct nfs4_get_lease_time_data { static void nfs4_get_lease_time_prepare(struct rpc_task *task, void *calldata) { - int ret; struct nfs4_get_lease_time_data *data = (struct nfs4_get_lease_time_data *)calldata; @@ -5470,12 +5451,10 @@ static void nfs4_get_lease_time_prepare(struct rpc_task *task, rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); /* just setup sequence, do not trigger session recovery since we're invoked within one */ - ret = nfs41_setup_sequence(data->clp->cl_session, - &data->args->la_seq_args, - &data->res->lr_seq_res, task); - - if (ret != -EAGAIN) - rpc_call_start(task); + nfs41_setup_sequence(data->clp->cl_session, + &data->args->la_seq_args, + &data->res->lr_seq_res, + task); dprintk("<-- %s\n", __func__); } @@ -5809,9 +5788,7 @@ static void nfs41_sequence_prepare(struct rpc_task *task, void *data) args = task->tk_msg.rpc_argp; res = task->tk_msg.rpc_resp; - if (nfs41_setup_sequence(clp->cl_session, args, res, task)) - return; - rpc_call_start(task); + nfs41_setup_sequence(clp->cl_session, args, res, task); } static void nfs41_sequence_prepare_privileged(struct rpc_task *task, void *data) @@ -5914,12 +5891,10 @@ static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data) struct nfs4_reclaim_complete_data *calldata = data; rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); - if (nfs41_setup_sequence(calldata->clp->cl_session, - &calldata->arg.seq_args, - &calldata->res.seq_res, task)) - return; - - rpc_call_start(task); + nfs41_setup_sequence(calldata->clp->cl_session, + &calldata->arg.seq_args, + &calldata->res.seq_res, + task); } static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp) @@ -6034,9 +6009,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) NFS_I(lgp->args.inode)->layout, lgp->args.ctx->state)) { rpc_exit(task, NFS4_OK); - return; } - rpc_call_start(task); } static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) @@ -6200,10 +6173,10 @@ nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata) struct nfs4_layoutreturn *lrp = calldata; dprintk("--> %s\n", __func__); - if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args, - &lrp->res.seq_res, task)) - return; - rpc_call_start(task); + nfs41_setup_sequence(lrp->clp->cl_session, + &lrp->args.seq_args, + &lrp->res.seq_res, + task); } static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) @@ -6364,10 +6337,10 @@ static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata) struct nfs_server *server = NFS_SERVER(data->args.inode); struct nfs4_session *session = nfs4_get_session(server); - if (nfs41_setup_sequence(session, &data->args.seq_args, - 
&data->res.seq_res, task)) - return; - rpc_call_start(task); + nfs41_setup_sequence(session, + &data->args.seq_args, + &data->res.seq_res, + task); } static void -- cgit v1.2.1 From fd0c09537a8494e9dccf3856b90058e1f97f1d62 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 1 Nov 2012 14:43:38 -0400 Subject: NFSv4: Simplify the NFSv4/v4.1 synchronous call switch We shouldn't need to pass the 'cache_reply' parameter if we initialise the sequence_args/sequence_res in the caller. Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 12 ------------ fs/nfs/nfs4_fs.h | 3 +-- fs/nfs/nfs4proc.c | 15 +++++++-------- 3 files changed, 8 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 9bdbfc3884a9..fb994471bd32 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -451,18 +451,6 @@ extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms, const char *ip_addr, rpc_authflavor_t authflavour); -extern int _nfs4_call_sync(struct rpc_clnt *clnt, - struct nfs_server *server, - struct rpc_message *msg, - struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, - int cache_reply); -extern int _nfs4_call_sync_session(struct rpc_clnt *clnt, - struct nfs_server *server, - struct rpc_message *msg, - struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, - int cache_reply); extern int nfs40_walk_client_list(struct nfs_client *clp, struct nfs_client **result, struct rpc_cred *cred); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 8022adec34cd..4f0cdc1b7148 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -39,8 +39,7 @@ struct nfs4_minor_version_ops { struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, - int cache_reply); + struct nfs4_sequence_res *res); bool (*match_stateid)(const nfs4_stateid *, const nfs4_stateid *); int (*find_root_sec)(struct nfs_server *, struct nfs_fh *, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 23b0c2fcb052..4aaaa3ba3088 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -664,14 +664,13 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, return ret; } +static int _nfs4_call_sync_session(struct rpc_clnt *clnt, struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, - int cache_reply) + struct nfs4_sequence_res *res) { - nfs41_init_sequence(args, res, cache_reply); return nfs4_call_sync_sequence(clnt, server, msg, args, res, 0); } @@ -689,18 +688,17 @@ static int nfs4_sequence_done(struct rpc_task *task, } #endif /* CONFIG_NFS_V4_1 */ +static int _nfs4_call_sync(struct rpc_clnt *clnt, struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, - int cache_reply) + struct nfs4_sequence_res *res) { - nfs41_init_sequence(args, res, cache_reply); return rpc_call_sync(clnt, msg, 0); } -static inline +static int nfs4_call_sync(struct rpc_clnt *clnt, struct nfs_server *server, struct rpc_message *msg, @@ -708,8 +706,9 @@ int nfs4_call_sync(struct rpc_clnt *clnt, struct nfs4_sequence_res *res, int cache_reply) { + nfs41_init_sequence(args, res, cache_reply); return server->nfs_client->cl_mvops->call_sync(clnt, server, msg, - args, res, cache_reply); + args, res); } static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) -- cgit v1.2.1 From 7b939a3f44293516c4225f640e8c4b9200beeabc Mon Sep 17 00:00:00 2001 From: Trond 
Myklebust Date: Thu, 1 Nov 2012 15:19:46 -0400 Subject: NFSv4.1: Clean up nfs41_setup_sequence Move all the sleep-and-exit cases into a single section of code. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4aaaa3ba3088..87525eb60bd8 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -523,18 +523,14 @@ int nfs41_setup_sequence(struct nfs4_session *session, if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) && !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { /* The state manager will wait until the slot table is empty */ - rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); - spin_unlock(&tbl->slot_tbl_lock); dprintk("%s session is draining\n", __func__); - return -EAGAIN; + goto out_sleep; } if (!rpc_queue_empty(&tbl->slot_tbl_waitq) && !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { - rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); - spin_unlock(&tbl->slot_tbl_lock); dprintk("%s enforce FIFO order\n", __func__); - return -EAGAIN; + goto out_sleep; } slot = nfs4_alloc_slot(tbl); @@ -542,10 +538,8 @@ int nfs41_setup_sequence(struct nfs4_session *session, /* If out of memory, try again in 1/4 second */ if (slot == ERR_PTR(-ENOMEM)) task->tk_timeout = HZ >> 2; - rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); - spin_unlock(&tbl->slot_tbl_lock); dprintk("<-- %s: no free slots\n", __func__); - return -EAGAIN; + goto out_sleep; } spin_unlock(&tbl->slot_tbl_lock); @@ -566,6 +560,10 @@ int nfs41_setup_sequence(struct nfs4_session *session, out_success: rpc_call_start(task); return 0; +out_sleep: + rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); + spin_unlock(&tbl->slot_tbl_lock); + return -EAGAIN; } EXPORT_SYMBOL_GPL(nfs41_setup_sequence); -- cgit v1.2.1 From 275e7e20aa8599719729f8ef4c09c9bfc4895642 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 1 Nov 2012 17:07:07 -0400 Subject: NFSv4.1: Remove the 'FIFO' behaviour for nfs41_setup_sequence It is more important to preserve the task priority behaviour, which ensures that things like reclaim writes take precedence over background and kupdate writes. 
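In miniature, the trade-off looks like this (illustrative only; the real ordering comes from the sunrpc wait queue's priority levels): strict FIFO would wake a backlog of background writers before a later reclaim write, while priority-based wakeup lets the reclaim write jump the queue.

#include <stdio.h>

enum prio { PRIO_BACKGROUND, PRIO_NORMAL, PRIO_RECLAIM };

struct waiter {
	const char *name;
	enum prio prio;
};

/* arrival order: two background writers queued before a reclaim write */
static const struct waiter queue[] = {
	{ "background-1", PRIO_BACKGROUND },
	{ "background-2", PRIO_BACKGROUND },
	{ "reclaim",      PRIO_RECLAIM },
};
#define NWAITERS (sizeof(queue) / sizeof(queue[0]))

/* FIFO: strictly by arrival order */
static const struct waiter *wake_fifo(void)
{
	return &queue[0];
}

/* priority: highest level first, FIFO only among equals */
static const struct waiter *wake_priority(void)
{
	const struct waiter *best = &queue[0];
	unsigned int i;

	for (i = 1; i < NWAITERS; i++)
		if (queue[i].prio > best->prio)
			best = &queue[i];
	return best;
}

int main(void)
{
	printf("FIFO wakes first:     %s\n", wake_fifo()->name);
	printf("priority wakes first: %s\n", wake_priority()->name);
	return 0;
}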
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 1 - fs/nfs/nfs4proc.c | 15 +-------------- fs/nfs/nfs4state.c | 4 +--- 3 files changed, 2 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 4f0cdc1b7148..4635bf51b3e6 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -236,7 +236,6 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser return server->nfs_client->cl_session; } -extern bool nfs4_set_task_privileged(struct rpc_task *task, void *dummy); extern int nfs4_setup_sequence(const struct nfs_server *server, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, struct rpc_task *task); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 87525eb60bd8..4b1635ce658d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -379,12 +379,6 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp #if defined(CONFIG_NFS_V4_1) -bool nfs4_set_task_privileged(struct rpc_task *task, void *dummy) -{ - rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); - return true; -} - static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) { struct nfs4_session *session; @@ -412,8 +406,7 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) if (tbl->highest_used_slotid != NFS4_NO_SLOT) send_new_highest_used_slotid = false; if (!nfs4_session_draining(session)) { - if (rpc_wake_up_first(&tbl->slot_tbl_waitq, - nfs4_set_task_privileged, NULL) != NULL) + if (rpc_wake_up_next(&tbl->slot_tbl_waitq) != NULL) send_new_highest_used_slotid = false; } spin_unlock(&tbl->slot_tbl_lock); @@ -527,12 +520,6 @@ int nfs41_setup_sequence(struct nfs4_session *session, goto out_sleep; } - if (!rpc_queue_empty(&tbl->slot_tbl_waitq) && - !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { - dprintk("%s enforce FIFO order\n", __func__); - goto out_sleep; - } - slot = nfs4_alloc_slot(tbl); if (IS_ERR(slot)) { /* If out of memory, try again in 1/4 second */ diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index c137421f2123..7d73df5a05d1 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -263,9 +263,7 @@ static void nfs4_end_drain_session(struct nfs_client *clp) if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { spin_lock(&tbl->slot_tbl_lock); for (i = 0; i <= tbl->max_slotid; i++) { - if (rpc_wake_up_first(&tbl->slot_tbl_waitq, - nfs4_set_task_privileged, - NULL) == NULL) + if (rpc_wake_up_next(&tbl->slot_tbl_waitq) == NULL) break; } spin_unlock(&tbl->slot_tbl_lock); -- cgit v1.2.1 From 8fe72bac8de784c4059b41a7dd6bb0151a3ae898 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 29 Oct 2012 19:02:20 -0400 Subject: NFSv4: Clean up handling of privileged operations Privileged rpc calls are those that are run by the state recovery thread, in cases where we're trying to recover the system after a server reboot or a network partition. In those cases, we want to fence off all other rpc calls (see nfs4_begin_drain_session()) so that they don't end up using stateids or clientids that are in the process of being recovered. Prior to this patch, we had to set up special callback functions in order to declare an rpc call as being privileged. By adding a new field to the sequence arguments, this patch simplifies things considerably, and allows us to declare the rpc call as privileged before it is run. 
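In outline, a call site now marks itself privileged through its sequence arguments rather than by swapping in a special rpc_call_ops table. A minimal sketch of the new pattern, using the two helpers added below ('data', 'is_recovery' and 'task_setup_data' stand in for whatever the call site already has):

    nfs41_init_sequence(&data->arg.seq_args, &data->res.seq_res, 0);
    if (is_recovery)
            /* let this call through the session drain fence */
            nfs4_set_sequence_privileged(&data->arg.seq_args);
    task = rpc_run_task(&task_setup_data);

The per-operation *_recover_* callback tables and their duplicated prepare functions then become dead code, which is the bulk of the diff.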
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 114 ++++++++++++++++++++---------------------------------- 1 file changed, 42 insertions(+), 72 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4b1635ce658d..38a709d78594 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -490,11 +490,17 @@ static void nfs41_init_sequence(struct nfs4_sequence_args *args, { args->sa_slot = NULL; args->sa_cache_this = 0; + args->sa_privileged = 0; if (cache_reply) args->sa_cache_this = 1; res->sr_slot = NULL; } +static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args) +{ + args->sa_privileged = 1; +} + int nfs41_setup_sequence(struct nfs4_session *session, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, @@ -514,7 +520,7 @@ int nfs41_setup_sequence(struct nfs4_session *session, spin_lock(&tbl->slot_tbl_lock); if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) && - !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { + !args->sa_privileged) { /* The state manager will wait until the slot table is empty */ dprintk("%s session is draining\n", __func__); goto out_sleep; @@ -548,6 +554,9 @@ out_success: rpc_call_start(task); return 0; out_sleep: + /* Privileged tasks are queued with top priority */ + if (args->sa_privileged) + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); spin_unlock(&tbl->slot_tbl_lock); return -EAGAIN; @@ -593,12 +602,6 @@ static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata) nfs41_setup_sequence(session, data->seq_args, data->seq_res, task); } -static void nfs41_call_priv_sync_prepare(struct rpc_task *task, void *calldata) -{ - rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); - nfs41_call_sync_prepare(task, calldata); -} - static void nfs41_call_sync_done(struct rpc_task *task, void *calldata) { struct nfs41_call_sync_data *data = calldata; @@ -611,17 +614,11 @@ static const struct rpc_call_ops nfs41_call_sync_ops = { .rpc_call_done = nfs41_call_sync_done, }; -static const struct rpc_call_ops nfs41_call_priv_sync_ops = { - .rpc_call_prepare = nfs41_call_priv_sync_prepare, - .rpc_call_done = nfs41_call_sync_done, -}; - static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, - int privileged) + struct nfs4_sequence_res *res) { int ret; struct rpc_task *task; @@ -637,8 +634,6 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, .callback_data = &data }; - if (privileged) - task_setup.callback_ops = &nfs41_call_priv_sync_ops; task = rpc_run_task(&task_setup); if (IS_ERR(task)) ret = PTR_ERR(task); @@ -656,16 +651,21 @@ int _nfs4_call_sync_session(struct rpc_clnt *clnt, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res) { - return nfs4_call_sync_sequence(clnt, server, msg, args, res, 0); + return nfs4_call_sync_sequence(clnt, server, msg, args, res); } #else -static inline +static void nfs41_init_sequence(struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply) { } +static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args) +{ +} + + static int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) { @@ -1475,13 +1475,6 @@ unlock_no_action: rcu_read_unlock(); out_no_action: task->tk_action = NULL; - -} - -static void nfs4_recover_open_prepare(struct rpc_task *task, void *calldata) -{ - rpc_task_set_priority(task, 
RPC_PRIORITY_PRIVILEGED); - nfs4_open_prepare(task, calldata); } static void nfs4_open_done(struct rpc_task *task, void *calldata) @@ -1542,12 +1535,6 @@ static const struct rpc_call_ops nfs4_open_ops = { .rpc_release = nfs4_open_release, }; -static const struct rpc_call_ops nfs4_recover_open_ops = { - .rpc_call_prepare = nfs4_recover_open_prepare, - .rpc_call_done = nfs4_open_done, - .rpc_release = nfs4_open_release, -}; - static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) { struct inode *dir = data->dir->d_inode; @@ -1577,7 +1564,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) data->rpc_status = 0; data->cancelled = 0; if (isrecover) - task_setup_data.callback_ops = &nfs4_recover_open_ops; + nfs4_set_sequence_privileged(&o_arg->seq_args); task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); @@ -4558,8 +4545,9 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) return; /* Do we need to do an open_to_lock_owner? */ if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) { - if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) + if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) { goto out_release_lock_seqid; + } data->arg.open_stateid = &state->stateid; data->arg.new_lock_owner = 1; data->res.open_seqid = data->arg.open_seqid; @@ -4574,13 +4562,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) nfs_release_seqid(data->arg.open_seqid); out_release_lock_seqid: nfs_release_seqid(data->arg.lock_seqid); - dprintk("%s: done!, ret = %d\n", __func__, task->tk_status); -} - -static void nfs4_recover_lock_prepare(struct rpc_task *task, void *calldata) -{ - rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); - nfs4_lock_prepare(task, calldata); + dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status); } static void nfs4_lock_done(struct rpc_task *task, void *calldata) @@ -4635,12 +4617,6 @@ static const struct rpc_call_ops nfs4_lock_ops = { .rpc_release = nfs4_lock_release, }; -static const struct rpc_call_ops nfs4_recover_lock_ops = { - .rpc_call_prepare = nfs4_recover_lock_prepare, - .rpc_call_done = nfs4_lock_done, - .rpc_release = nfs4_lock_release, -}; - static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error) { switch (error) { @@ -4683,15 +4659,15 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f return -ENOMEM; if (IS_SETLKW(cmd)) data->arg.block = 1; - if (recovery_type > NFS_LOCK_NEW) { - if (recovery_type == NFS_LOCK_RECLAIM) - data->arg.reclaim = NFS_LOCK_RECLAIM; - task_setup_data.callback_ops = &nfs4_recover_lock_ops; - } nfs41_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1); msg.rpc_argp = &data->arg; msg.rpc_resp = &data->res; task_setup_data.callback_data = data; + if (recovery_type > NFS_LOCK_NEW) { + if (recovery_type == NFS_LOCK_RECLAIM) + data->arg.reclaim = NFS_LOCK_RECLAIM; + nfs4_set_sequence_privileged(&data->arg.seq_args); + } task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); @@ -5432,7 +5408,6 @@ static void nfs4_get_lease_time_prepare(struct rpc_task *task, (struct nfs4_get_lease_time_data *)calldata; dprintk("--> %s\n", __func__); - rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); /* just setup sequence, do not trigger session recovery since we're invoked within one */ nfs41_setup_sequence(data->clp->cl_session, @@ -5500,6 +5475,7 @@ int nfs4_proc_get_lease_time(struct 
nfs_client *clp, struct nfs_fsinfo *fsinfo) int status; nfs41_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0); + nfs4_set_sequence_privileged(&args.la_seq_args); dprintk("--> %s\n", __func__); task = rpc_run_task(&task_setup); @@ -5775,26 +5751,15 @@ static void nfs41_sequence_prepare(struct rpc_task *task, void *data) nfs41_setup_sequence(clp->cl_session, args, res, task); } -static void nfs41_sequence_prepare_privileged(struct rpc_task *task, void *data) -{ - rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); - nfs41_sequence_prepare(task, data); -} - static const struct rpc_call_ops nfs41_sequence_ops = { .rpc_call_done = nfs41_sequence_call_done, .rpc_call_prepare = nfs41_sequence_prepare, .rpc_release = nfs41_sequence_release, }; -static const struct rpc_call_ops nfs41_sequence_privileged_ops = { - .rpc_call_done = nfs41_sequence_call_done, - .rpc_call_prepare = nfs41_sequence_prepare_privileged, - .rpc_release = nfs41_sequence_release, -}; - -static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred, - const struct rpc_call_ops *seq_ops) +static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, + struct rpc_cred *cred, + bool is_privileged) { struct nfs4_sequence_data *calldata; struct rpc_message msg = { @@ -5804,7 +5769,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_ struct rpc_task_setup task_setup_data = { .rpc_client = clp->cl_rpcclient, .rpc_message = &msg, - .callback_ops = seq_ops, + .callback_ops = &nfs41_sequence_ops, .flags = RPC_TASK_ASYNC | RPC_TASK_SOFT, }; @@ -5816,6 +5781,8 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_ return ERR_PTR(-ENOMEM); } nfs41_init_sequence(&calldata->args, &calldata->res, 0); + if (is_privileged) + nfs4_set_sequence_privileged(&calldata->args); msg.rpc_argp = &calldata->args; msg.rpc_resp = &calldata->res; calldata->clp = clp; @@ -5831,7 +5798,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0) return 0; - task = _nfs41_proc_sequence(clp, cred, &nfs41_sequence_ops); + task = _nfs41_proc_sequence(clp, cred, false); if (IS_ERR(task)) ret = PTR_ERR(task); else @@ -5845,7 +5812,7 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) struct rpc_task *task; int ret; - task = _nfs41_proc_sequence(clp, cred, &nfs41_sequence_privileged_ops); + task = _nfs41_proc_sequence(clp, cred, true); if (IS_ERR(task)) { ret = PTR_ERR(task); goto out; @@ -5874,7 +5841,6 @@ static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data) { struct nfs4_reclaim_complete_data *calldata = data; - rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); nfs41_setup_sequence(calldata->clp->cl_session, &calldata->arg.seq_args, &calldata->res.seq_res, @@ -5955,6 +5921,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp) calldata->arg.one_fs = 0; nfs41_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 0); + nfs4_set_sequence_privileged(&calldata->arg.seq_args); msg.rpc_argp = &calldata->arg; msg.rpc_resp = &calldata->res; task_setup_data.callback_data = calldata; @@ -6521,7 +6488,9 @@ static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid) dprintk("NFS call test_stateid %p\n", stateid); nfs41_init_sequence(&args.seq_args, &res.seq_res, 0); - status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); + nfs4_set_sequence_privileged(&args.seq_args); 
+ status = nfs4_call_sync_sequence(server->client, server, &msg, + &args.seq_args, &res.seq_res); if (status != NFS_OK) { dprintk("NFS reply test_stateid: failed, %d\n", status); return status; @@ -6568,8 +6537,9 @@ static int _nfs4_free_stateid(struct nfs_server *server, nfs4_stateid *stateid) dprintk("NFS call free_stateid %p\n", stateid); nfs41_init_sequence(&args.seq_args, &res.seq_res, 0); + nfs4_set_sequence_privileged(&args.seq_args); status = nfs4_call_sync_sequence(server->client, server, &msg, - &args.seq_args, &res.seq_res, 1); + &args.seq_args, &res.seq_res); dprintk("NFS reply free_stateid: %d\n", status); return status; } -- cgit v1.2.1 From 104287cd4ebb5484c654551c102c25c94227f717 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Mon, 12 Nov 2012 14:13:13 -0500 Subject: NFS: Remove _nfs_call_sync_session All it does is pass its arguments through to another function. Let's cut out the middleman... Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 38a709d78594..7f8b42781338 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -644,16 +644,6 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, return ret; } -static -int _nfs4_call_sync_session(struct rpc_clnt *clnt, - struct nfs_server *server, - struct rpc_message *msg, - struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res) -{ - return nfs4_call_sync_sequence(clnt, server, msg, args, res); -} - #else static void nfs41_init_sequence(struct nfs4_sequence_args *args, @@ -6659,7 +6649,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { #if defined(CONFIG_NFS_V4_1) static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { .minor_version = 1, - .call_sync = _nfs4_call_sync_session, + .call_sync = nfs4_call_sync_sequence, .match_stateid = nfs41_match_stateid, .find_root_sec = nfs41_find_root_sec, .reboot_recovery_ops = &nfs41_reboot_recovery_ops, -- cgit v1.2.1 From 1e1093c7fd4951bb4272212c238d09cd7a22f5fc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 1 Nov 2012 16:44:05 -0400 Subject: NFSv4.1: Don't mess with task priorities in nfs41_setup_sequence We want to preserve the rpc_task priority for things like writebacks, that may have differing levels of urgency. 
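A condensed before-and-after of the hunk below; the old code clobbered the task's own priority, the new code boosts only this one wait:

    /* before: sticky -- the task is left at PRIVILEGED (or NORMAL) */
    rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
    rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);

    /* after: the task's own priority survives; only this sleep is boosted */
    if (args->sa_privileged)
            rpc_sleep_on_priority(&tbl->slot_tbl_waitq, task,
                                  NULL, RPC_PRIORITY_PRIVILEGED);
    else
            rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);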
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 7f8b42781338..99d99a5a3f61 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -536,8 +536,6 @@ int nfs41_setup_sequence(struct nfs4_session *session, } spin_unlock(&tbl->slot_tbl_lock); - rpc_task_set_priority(task, RPC_PRIORITY_NORMAL); - args->sa_slot = slot; dprintk("<-- %s slotid=%d seqid=%d\n", __func__, @@ -556,8 +554,10 @@ out_success: out_sleep: /* Privileged tasks are queued with top priority */ if (args->sa_privileged) - rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); - rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); + rpc_sleep_on_priority(&tbl->slot_tbl_waitq, task, + NULL, RPC_PRIORITY_PRIVILEGED); + else + rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); spin_unlock(&tbl->slot_tbl_lock); return -EAGAIN; } -- cgit v1.2.1 From b75ad4cda5a6cd3431b1c65c2739c5ebd2c4b9da Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 29 Nov 2012 17:27:47 -0500 Subject: NFSv4.1: Ensure smooth handover of slots from one task to the next waiting Currently, we see a lot of bouncing for the value of highest_used_slotid due to the fact that slots are getting freed, instead of getting instantly transmitted to the next waiting task. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 12 +++++++---- fs/nfs/nfs4session.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++--- fs/nfs/nfs4session.h | 4 ++++ fs/nfs/nfs4state.c | 6 +----- 4 files changed, 69 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 99d99a5a3f61..992233561dbd 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -401,14 +401,15 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) if (tbl->highest_used_slotid > tbl->target_highest_slotid) send_new_highest_used_slotid = true; + if (nfs41_wake_and_assign_slot(tbl, res->sr_slot)) { + send_new_highest_used_slotid = false; + goto out_unlock; + } nfs4_free_slot(tbl, res->sr_slot); if (tbl->highest_used_slotid != NFS4_NO_SLOT) send_new_highest_used_slotid = false; - if (!nfs4_session_draining(session)) { - if (rpc_wake_up_next(&tbl->slot_tbl_waitq) != NULL) - send_new_highest_used_slotid = false; - } +out_unlock: spin_unlock(&tbl->slot_tbl_lock); res->sr_slot = NULL; if (send_new_highest_used_slotid) @@ -1465,6 +1466,7 @@ unlock_no_action: rcu_read_unlock(); out_no_action: task->tk_action = NULL; + nfs4_sequence_done(task, &data->o_res.seq_res); } static void nfs4_open_done(struct rpc_task *task, void *calldata) @@ -2135,6 +2137,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) if (!call_close) { /* Note: exit _without_ calling nfs4_close_done */ task->tk_action = NULL; + nfs4_sequence_done(task, &calldata->res.seq_res); goto out; } @@ -4384,6 +4387,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) { /* Note: exit _without_ running nfs4_locku_done */ task->tk_action = NULL; + nfs4_sequence_done(task, &calldata->res.seq_res); return; } calldata->timestamp = jiffies; diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c index 701170293ceb..066cfa101b41 100644 --- a/fs/nfs/nfs4session.c +++ b/fs/nfs/nfs4session.c @@ -217,11 +217,65 @@ static void nfs4_destroy_slot_tables(struct nfs4_session *session) nfs4_shrink_slot_table(&session->bc_slot_table, 0); } +static bool nfs41_assign_slot(struct rpc_task 
*task, void *pslot) +{ + struct nfs4_sequence_args *args = task->tk_msg.rpc_argp; + struct nfs4_sequence_res *res = task->tk_msg.rpc_resp; + struct nfs4_slot *slot = pslot; + struct nfs4_slot_table *tbl = slot->table; + + if (nfs4_session_draining(tbl->session) && !args->sa_privileged) + return false; + slot->renewal_time = jiffies; + slot->generation = tbl->generation; + args->sa_slot = slot; + res->sr_slot = slot; + res->sr_status_flags = 0; + res->sr_status = 1; + return true; +} + +static bool __nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl, + struct nfs4_slot *slot) +{ + if (rpc_wake_up_first(&tbl->slot_tbl_waitq, nfs41_assign_slot, slot)) + return true; + return false; +} + +bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl, + struct nfs4_slot *slot) +{ + if (slot->slot_nr > tbl->max_slotid) + return false; + return __nfs41_wake_and_assign_slot(tbl, slot); +} + +static bool nfs41_try_wake_next_slot_table_entry(struct nfs4_slot_table *tbl) +{ + struct nfs4_slot *slot = nfs4_alloc_slot(tbl); + if (!IS_ERR(slot)) { + bool ret = __nfs41_wake_and_assign_slot(tbl, slot); + if (ret) + return ret; + nfs4_free_slot(tbl, slot); + } + return false; +} + +void nfs41_wake_slot_table(struct nfs4_slot_table *tbl) +{ + for (;;) { + if (!nfs41_try_wake_next_slot_table_entry(tbl)) + break; + } +} + /* Update the client's idea of target_highest_slotid */ static void nfs41_set_target_slotid_locked(struct nfs4_slot_table *tbl, u32 target_highest_slotid) { - unsigned int max_slotid, i; + unsigned int max_slotid; if (tbl->target_highest_slotid == target_highest_slotid) return; @@ -229,9 +283,8 @@ static void nfs41_set_target_slotid_locked(struct nfs4_slot_table *tbl, tbl->generation++; max_slotid = min(NFS4_MAX_SLOT_TABLE - 1, tbl->target_highest_slotid); - for (i = tbl->max_slotid + 1; i <= max_slotid; i++) - rpc_wake_up_next(&tbl->slot_tbl_waitq); tbl->max_slotid = max_slotid; + nfs41_wake_slot_table(tbl); } void nfs41_set_target_slotid(struct nfs4_slot_table *tbl, diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index bdd14a60722b..7db739370164 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -94,6 +94,10 @@ static inline bool nfs4_session_draining(struct nfs4_session *session) return !!test_bit(NFS4_SESSION_DRAINING, &session->session_state); } +bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl, + struct nfs4_slot *slot); +void nfs41_wake_slot_table(struct nfs4_slot_table *tbl); + /* * Determine if sessions are in use. */ diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 7d73df5a05d1..78e90a80fc3a 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -255,17 +255,13 @@ static void nfs4_end_drain_session(struct nfs_client *clp) { struct nfs4_session *ses = clp->cl_session; struct nfs4_slot_table *tbl; - unsigned int i; if (ses == NULL) return; tbl = &ses->fc_slot_table; if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { spin_lock(&tbl->slot_tbl_lock); - for (i = 0; i <= tbl->max_slotid; i++) { - if (rpc_wake_up_next(&tbl->slot_tbl_waitq) == NULL) - break; - } + nfs41_wake_slot_table(tbl); spin_unlock(&tbl->slot_tbl_lock); } } -- cgit v1.2.1 From 1fa8064429d0acbf5bbf3c8a53f65679fdacc75e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 2 Dec 2012 13:54:59 -0500 Subject: NFSv4.1: Try to eliminate outliers when updating target_highest_slotid Look for sudden changes in the first and second derivatives in order to eliminate outlier changes to target_highest_slotid (which are due to out-of-order RPC replies). 
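Concretely, the slot table now tracks a first and a second derivative of the target value, and a reply is dropped as an outlier only when both change sign at once. The derivative helper added below returns half the step, rounded away from zero; a worked trace follows (values invented for illustration):

    static s32 deriv(s32 s1, s32 s2) /* = nfs41_derivative_target_slotid() */
    {
            s1 -= s2;
            if (s1 == 0)
                    return 0;
            return (s1 < 0) ? (s1 - 1) >> 1 : (s1 + 1) >> 1;
    }

    /*
     * In-order replies raise the target 4 -> 6 -> 10:
     *   after 6:  d = deriv(6, 4)  = +1, d2 = +1, target = 6
     *   after 10: d = deriv(10, 6) = +2, d2 = +1, target = 10
     *
     * A stale reply advertising 3 then arrives out of order:
     *   d  = deriv(3, 10)  = -4  (sign flip vs. +2)
     *   d2 = deriv(-4, +2) = -4  (sign flip vs. +1)
     *
     * Both derivatives flip, so the reply is ignored and the target
     * stays at 10; the stored derivatives still take the hit, so the
     * estimate re-converges over the next genuine replies.
     */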
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4session.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++----- fs/nfs/nfs4session.h | 2 ++ 2 files changed, 60 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c index 066cfa101b41..ed5aa9fa9c7b 100644 --- a/fs/nfs/nfs4session.c +++ b/fs/nfs/nfs4session.c @@ -178,6 +178,8 @@ static void nfs4_reset_slot_table(struct nfs4_slot_table *tbl, tbl->highest_used_slotid = NFS4_NO_SLOT; tbl->target_highest_slotid = server_highest_slotid; tbl->server_highest_slotid = server_highest_slotid; + tbl->d_target_highest_slotid = 0; + tbl->d2_target_highest_slotid = 0; tbl->max_slotid = server_highest_slotid; } @@ -292,6 +294,8 @@ void nfs41_set_target_slotid(struct nfs4_slot_table *tbl, { spin_lock(&tbl->slot_tbl_lock); nfs41_set_target_slotid_locked(tbl, target_highest_slotid); + tbl->d_target_highest_slotid = 0; + tbl->d2_target_highest_slotid = 0; spin_unlock(&tbl->slot_tbl_lock); } @@ -307,16 +311,65 @@ static void nfs41_set_server_slotid_locked(struct nfs4_slot_table *tbl, tbl->server_highest_slotid = highest_slotid; } +static s32 nfs41_derivative_target_slotid(s32 s1, s32 s2) +{ + s1 -= s2; + if (s1 == 0) + return 0; + if (s1 < 0) + return (s1 - 1) >> 1; + return (s1 + 1) >> 1; +} + +static int nfs41_sign_s32(s32 s1) +{ + if (s1 > 0) + return 1; + if (s1 < 0) + return -1; + return 0; +} + +static bool nfs41_same_sign_or_zero_s32(s32 s1, s32 s2) +{ + if (!s1 || !s2) + return true; + return nfs41_sign_s32(s1) == nfs41_sign_s32(s2); +} + +/* Try to eliminate outliers by checking for sharp changes in the + * derivatives and second derivatives + */ +static bool nfs41_is_outlier_target_slotid(struct nfs4_slot_table *tbl, + u32 new_target) +{ + s32 d_target, d2_target; + bool ret = true; + + d_target = nfs41_derivative_target_slotid(new_target, + tbl->target_highest_slotid); + d2_target = nfs41_derivative_target_slotid(d_target, + tbl->d_target_highest_slotid); + /* Is first derivative same sign? */ + if (nfs41_same_sign_or_zero_s32(d_target, tbl->d_target_highest_slotid)) + ret = false; + /* Is second derivative same sign? 
*/ + if (nfs41_same_sign_or_zero_s32(d2_target, tbl->d2_target_highest_slotid)) + ret = false; + tbl->d_target_highest_slotid = d_target; + tbl->d2_target_highest_slotid = d2_target; + return ret; +} + void nfs41_update_target_slotid(struct nfs4_slot_table *tbl, struct nfs4_slot *slot, struct nfs4_sequence_res *res) { spin_lock(&tbl->slot_tbl_lock); - if (tbl->generation != slot->generation) - goto out; - nfs41_set_server_slotid_locked(tbl, res->sr_highest_slotid); - nfs41_set_target_slotid_locked(tbl, res->sr_target_highest_slotid); -out: + if (!nfs41_is_outlier_target_slotid(tbl, res->sr_target_highest_slotid)) + nfs41_set_target_slotid_locked(tbl, res->sr_target_highest_slotid); + if (tbl->generation == slot->generation) + nfs41_set_server_slotid_locked(tbl, res->sr_highest_slotid); spin_unlock(&tbl->slot_tbl_lock); } diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index 7db739370164..04f834cab16c 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -38,6 +38,8 @@ struct nfs4_slot_table { * op for dynamic resizing */ u32 target_highest_slotid; /* Server max_slot target */ u32 server_highest_slotid; /* Server highest slotid */ + s32 d_target_highest_slotid; /* Derivative */ + s32 d2_target_highest_slotid; /* 2nd derivative */ unsigned long generation; /* Generation counter for target_highest_slotid */ struct completion complete; -- cgit v1.2.1 From 081c0414dcdfd13c4276db30a775a5d0f72ad91a Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Tue, 27 Nov 2012 18:38:53 +0400 Subject: CIFS: Do not permit write to a range mandatory locked with a read lock We don't need to permit a write to the area locked with a read lock by any process including the process that issues the write. Reviewed-by: Jeff Layton Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 2 +- fs/cifs/file.c | 27 ++++++++++++++++++--------- 2 files changed, 19 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 15a8cb66a07b..a152f3645b09 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -186,7 +186,7 @@ extern void cifs_mark_open_files_invalid(struct cifs_tcon *tcon); extern bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length, __u8 type, struct cifsLockInfo **conf_lock, - bool rw_check); + int rw_check); extern void cifs_add_pending_open(struct cifs_fid *fid, struct tcon_link *tlink, struct cifs_pending_open *open); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index bceffa8c034e..ebebbb2bc1fb 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -759,10 +759,15 @@ cifs_del_lock_waiters(struct cifsLockInfo *lock) } } +#define CIFS_LOCK_OP 0 +#define CIFS_READ_OP 1 +#define CIFS_WRITE_OP 2 + +/* @rw_check : 0 - no op, 1 - read, 2 - write */ static bool cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset, __u64 length, __u8 type, struct cifsFileInfo *cfile, - struct cifsLockInfo **conf_lock, bool rw_check) + struct cifsLockInfo **conf_lock, int rw_check) { struct cifsLockInfo *li; struct cifsFileInfo *cur_cfile = fdlocks->cfile; @@ -772,9 +777,13 @@ cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset, if (offset + length <= li->offset || offset >= li->offset + li->length) continue; - if (rw_check && server->ops->compare_fids(cfile, cur_cfile) && - current->tgid == li->pid) - continue; + if (rw_check != CIFS_LOCK_OP && current->tgid == li->pid && + server->ops->compare_fids(cfile, cur_cfile)) { + /* shared lock prevents write op through 
the same fid */ + if (!(li->type & server->vals->shared_lock_type) || + rw_check != CIFS_WRITE_OP) + continue; + } if ((type & server->vals->shared_lock_type) && ((server->ops->compare_fids(cfile, cur_cfile) && current->tgid == li->pid) || type == li->type)) @@ -789,7 +798,7 @@ cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset, bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length, __u8 type, struct cifsLockInfo **conf_lock, - bool rw_check) + int rw_check) { bool rc = false; struct cifs_fid_locks *cur; @@ -825,7 +834,7 @@ cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length, down_read(&cinode->lock_sem); exist = cifs_find_lock_conflict(cfile, offset, length, type, - &conf_lock, false); + &conf_lock, CIFS_LOCK_OP); if (exist) { flock->fl_start = conf_lock->offset; flock->fl_end = conf_lock->offset + conf_lock->length - 1; @@ -872,7 +881,7 @@ try_again: down_write(&cinode->lock_sem); exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length, - lock->type, &conf_lock, false); + lock->type, &conf_lock, CIFS_LOCK_OP); if (!exist && cinode->can_cache_brlcks) { list_add_tail(&lock->llist, &cfile->llist->locks); up_write(&cinode->lock_sem); @@ -2466,7 +2475,7 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov, down_read(&cinode->lock_sem); if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs), server->vals->exclusive_lock_type, NULL, - true)) { + CIFS_WRITE_OP)) { mutex_lock(&inode->i_mutex); rc = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); @@ -2901,7 +2910,7 @@ cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, down_read(&cinode->lock_sem); if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs), tcon->ses->server->vals->shared_lock_type, - NULL, true)) + NULL, CIFS_READ_OP)) rc = generic_file_aio_read(iocb, iov, nr_segs, pos); up_read(&cinode->lock_sem); return rc; -- cgit v1.2.1 From 03eca704cfa426aebf6edcc0208536835c109a9f Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 6 Dec 2012 21:24:33 +0400 Subject: CIFS: Fix possible data coherency problem after oplock break to None by using cifs_invalidate_mapping rather than invalidate_remote_inode in cifs_oplock_break - this invalidates all inode pages and resets fscache cookies. Reviewed-by: Jeff Layton Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index ebebbb2bc1fb..1b322d041f1e 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3554,7 +3554,7 @@ void cifs_oplock_break(struct work_struct *work) if (cinode->clientCanCacheRead == 0) { rc = filemap_fdatawait(inode->i_mapping); mapping_set_error(inode->i_mapping, rc); - invalidate_remote_inode(inode); + cifs_invalidate_mapping(inode); } cFYI(1, "Oplock flush inode %p rc %d", inode, rc); } -- cgit v1.2.1 From 684c9aaebbb0ea3a9954d605d4908e650659e7db Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 7 Dec 2012 16:48:39 -0800 Subject: vfs: fix O_DIRECT read past end of block device The direct-IO write path already had the i_size checks in mm/filemap.c, but it turns out the read path did not, and removing the block size checks in fs/block_dev.c (commit bbec0270bdd8: "blkdev_max_block: make private to fs/buffer.c") removed the magic "shrink IO to past the end of the device" code there. Fix it by truncating the IO to the size of the block device, like the write path already does. NOTE! 
I suspect the write path would be *much* better off doing it this way in fs/block_dev.c, rather than hidden deep in mm/filemap.c. The mm/filemap.c code is extremely hard to follow, and has various conditionals on the target being a block device (ie the flag passed in to 'generic_write_checks()', along with a conditional update of the inode timestamp etc). It is also quite possible that we should treat this whole block device size as a "s_maxbytes" issue, and try to make the logic even more generic. However, in the meantime this is the fairly minimal targeted fix. Noted by Milan Broz thanks to a regression test for the cryptsetup reencrypt tool. Reported-and-tested-by: Milan Broz Signed-off-by: Linus Torvalds --- fs/block_dev.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index a1e09b4fe1ba..ab3a456f6650 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1544,6 +1544,22 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, } EXPORT_SYMBOL_GPL(blkdev_aio_write); +static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct inode *bd_inode = file->f_mapping->host; + loff_t size = i_size_read(bd_inode); + + if (pos >= size) + return 0; + + size -= pos; + if (size < INT_MAX) + nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size); + return generic_file_aio_read(iocb, iov, nr_segs, pos); +} + /* * Try to release a page associated with block device when the system * is under memory pressure. @@ -1574,7 +1590,7 @@ const struct file_operations def_blk_fops = { .llseek = block_llseek, .read = do_sync_read, .write = do_sync_write, - .aio_read = generic_file_aio_read, + .aio_read = blkdev_aio_read, .aio_write = blkdev_aio_write, .mmap = generic_file_mmap, .fsync = blkdev_fsync, -- cgit v1.2.1 From faa65f07d21e7d37190c91fdcf9f940d733ae3cc Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 3 Dec 2012 06:05:29 -0500 Subject: cifs: simplify id_to_sid and sid_to_id mapping code The cifs.idmap handling code currently causes the kernel to cache the data from userspace twice. It first looks in an rbtree to see if there is a matching entry for the given id. If there isn't, it calls request_key, which then checks its cache and then calls out to userland if it doesn't have one. If the userland program establishes a mapping and downcalls with that info, it then gets cached in the keyring and in this rbtree. Aside from the double memory usage and the performance penalty in doing all of these extra copies, there are some nasty bugs in here too. The code declares four rbtrees and spinlocks to protect them, but only seems to use two of them. The upshot is that the same tree is used to hold (eg) uid:sid and sid:uid mappings. The comparators aren't equipped to deal with that. I think we'd be best off to remove a layer of caching in this code. If this was originally done for performance reasons, then that really seems like a premature optimization. This patch does that -- it removes the rbtrees and the locks that protect them and simply has the code do a request_key call on each call into sid_to_id and id_to_sid. This greatly simplifies this code and should roughly halve the memory utilization from using the idmapping code.
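The replacement flow is short enough to sketch; this is id_to_sid() from the diff below, condensed, with error handling trimmed:

    char desc[3 + 10 + 1];  /* "oi:"/"gi:" + decimal u32 + NUL */
    const struct cred *saved_cred;
    struct key *sidkey;

    snprintf(desc, sizeof(desc), "%ci:%u",
             sidtype == SIDOWNER ? 'o' : 'g', cid);
    saved_cred = override_creds(root_cred);  /* upcall with root creds */
    sidkey = request_key(&cifs_idmap_key_type, desc, "");
    if (!IS_ERR(sidkey)) {
            cifs_copy_sid(ssid, (struct cifs_sid *)sidkey->payload.data);
            key_put(sidkey);
    }
    revert_creds(saved_cred);

request_key() already consults the keyring before upcalling, so the one remaining cache lives where the key infrastructure can expire and reclaim it for us.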
Reviewed-by: Shirish Pargaonkar Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 535 +++++++++------------------------------------------- fs/cifs/cifsacl.h | 28 +-- fs/cifs/cifsfs.c | 1 - fs/cifs/cifsproto.h | 1 - 4 files changed, 99 insertions(+), 466 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 42b3fe981a0a..f4508ee4e80d 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -44,128 +44,6 @@ static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; static const struct cred *root_cred; -static void -shrink_idmap_tree(struct rb_root *root, int nr_to_scan, int *nr_rem, - int *nr_del) -{ - struct rb_node *node; - struct rb_node *tmp; - struct cifs_sid_id *psidid; - - node = rb_first(root); - while (node) { - tmp = node; - node = rb_next(tmp); - psidid = rb_entry(tmp, struct cifs_sid_id, rbnode); - if (nr_to_scan == 0 || *nr_del == nr_to_scan) - ++(*nr_rem); - else { - if (time_after(jiffies, psidid->time + SID_MAP_EXPIRE) - && psidid->refcount == 0) { - rb_erase(tmp, root); - ++(*nr_del); - } else - ++(*nr_rem); - } - } -} - -/* - * Run idmap cache shrinker. - */ -static int -cifs_idmap_shrinker(struct shrinker *shrink, struct shrink_control *sc) -{ - int nr_to_scan = sc->nr_to_scan; - int nr_del = 0; - int nr_rem = 0; - struct rb_root *root; - - root = &uidtree; - spin_lock(&siduidlock); - shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); - spin_unlock(&siduidlock); - - root = &gidtree; - spin_lock(&sidgidlock); - shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); - spin_unlock(&sidgidlock); - - root = &siduidtree; - spin_lock(&uidsidlock); - shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); - spin_unlock(&uidsidlock); - - root = &sidgidtree; - spin_lock(&gidsidlock); - shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); - spin_unlock(&gidsidlock); - - return nr_rem; -} - -static void -sid_rb_insert(struct rb_root *root, unsigned long cid, - struct cifs_sid_id **psidid, char *typestr) -{ - char *strptr; - struct rb_node *node = root->rb_node; - struct rb_node *parent = NULL; - struct rb_node **linkto = &(root->rb_node); - struct cifs_sid_id *lsidid; - - while (node) { - lsidid = rb_entry(node, struct cifs_sid_id, rbnode); - parent = node; - if (cid > lsidid->id) { - linkto = &(node->rb_left); - node = node->rb_left; - } - if (cid < lsidid->id) { - linkto = &(node->rb_right); - node = node->rb_right; - } - } - - (*psidid)->id = cid; - (*psidid)->time = jiffies - (SID_MAP_RETRY + 1); - (*psidid)->refcount = 0; - - sprintf((*psidid)->sidstr, "%s", typestr); - strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr); - sprintf(strptr, "%ld", cid); - - clear_bit(SID_ID_PENDING, &(*psidid)->state); - clear_bit(SID_ID_MAPPED, &(*psidid)->state); - - rb_link_node(&(*psidid)->rbnode, parent, linkto); - rb_insert_color(&(*psidid)->rbnode, root); -} - -static struct cifs_sid_id * -sid_rb_search(struct rb_root *root, unsigned long cid) -{ - struct rb_node *node = root->rb_node; - struct cifs_sid_id *lsidid; - - while (node) { - lsidid = rb_entry(node, struct cifs_sid_id, rbnode); - if (cid > lsidid->id) - node = node->rb_left; - else if (cid < lsidid->id) - node = node->rb_right; - else /* node found */ - return lsidid; - } - - return NULL; -} - -static struct shrinker cifs_shrinker = { - .shrink = cifs_idmap_shrinker, - .seeks = DEFAULT_SEEKS, -}; - static int cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep) { @@ -195,30 +73,39 @@ static struct key_type 
cifs_idmap_key_type = { .match = user_match, }; -static void -sid_to_str(struct cifs_sid *sidptr, char *sidstr) +static char * +sid_to_key_str(struct cifs_sid *sidptr, unsigned int type) { - int i; + int i, len; unsigned int saval; - char *strptr; + char *sidstr, *strptr; - strptr = sidstr; + /* 3 bytes for prefix */ + sidstr = kmalloc(3 + SID_STRING_BASE_SIZE + + (SID_STRING_SUBAUTH_SIZE * sidptr->num_subauth), + GFP_KERNEL); + if (!sidstr) + return sidstr; - sprintf(strptr, "S-%hhu", sidptr->revision); - strptr = sidstr + strlen(sidstr); + strptr = sidstr; + len = sprintf(strptr, "%cs:S-%hhu", type == SIDOWNER ? 'o' : 'g', + sidptr->revision); + strptr += len; for (i = 0; i < NUM_AUTHS; ++i) { if (sidptr->authority[i]) { - sprintf(strptr, "-%hhu", sidptr->authority[i]); - strptr = sidstr + strlen(sidstr); + len = sprintf(strptr, "-%hhu", sidptr->authority[i]); + strptr += len; } } for (i = 0; i < sidptr->num_subauth; ++i) { saval = le32_to_cpu(sidptr->sub_auth[i]); - sprintf(strptr, "-%u", saval); - strptr = sidstr + strlen(sidstr); + len = sprintf(strptr, "-%u", saval); + strptr += len; } + + return sidstr; } /* @@ -284,184 +171,38 @@ cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src) dst->sub_auth[i] = src->sub_auth[i]; } -static void -id_rb_insert(struct rb_root *root, struct cifs_sid *sidptr, - struct cifs_sid_id **psidid, char *typestr) -{ - int rc; - char *strptr; - struct rb_node *node = root->rb_node; - struct rb_node *parent = NULL; - struct rb_node **linkto = &(root->rb_node); - struct cifs_sid_id *lsidid; - - while (node) { - lsidid = rb_entry(node, struct cifs_sid_id, rbnode); - parent = node; - rc = compare_sids(sidptr, &((lsidid)->sid)); - if (rc > 0) { - linkto = &(node->rb_left); - node = node->rb_left; - } else if (rc < 0) { - linkto = &(node->rb_right); - node = node->rb_right; - } - } - - cifs_copy_sid(&(*psidid)->sid, sidptr); - (*psidid)->time = jiffies - (SID_MAP_RETRY + 1); - (*psidid)->refcount = 0; - - sprintf((*psidid)->sidstr, "%s", typestr); - strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr); - sid_to_str(&(*psidid)->sid, strptr); - - clear_bit(SID_ID_PENDING, &(*psidid)->state); - clear_bit(SID_ID_MAPPED, &(*psidid)->state); - - rb_link_node(&(*psidid)->rbnode, parent, linkto); - rb_insert_color(&(*psidid)->rbnode, root); -} - -static struct cifs_sid_id * -id_rb_search(struct rb_root *root, struct cifs_sid *sidptr) -{ - int rc; - struct rb_node *node = root->rb_node; - struct cifs_sid_id *lsidid; - - while (node) { - lsidid = rb_entry(node, struct cifs_sid_id, rbnode); - rc = compare_sids(sidptr, &((lsidid)->sid)); - if (rc > 0) { - node = node->rb_left; - } else if (rc < 0) { - node = node->rb_right; - } else /* node found */ - return lsidid; - } - - return NULL; -} - -static int -sidid_pending_wait(void *unused) -{ - schedule(); - return signal_pending(current) ? 
-ERESTARTSYS : 0; -} - static int -id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid) +id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid) { - int rc = 0; + int rc; struct key *sidkey; + char desc[3 + 10 + 1]; /* 3 byte prefix + 10 bytes for value + NULL */ const struct cred *saved_cred; - struct cifs_sid *lsid; - struct cifs_sid_id *psidid, *npsidid; - struct rb_root *cidtree; - spinlock_t *cidlock; - - if (sidtype == SIDOWNER) { - cidlock = &siduidlock; - cidtree = &uidtree; - } else if (sidtype == SIDGROUP) { - cidlock = &sidgidlock; - cidtree = &gidtree; - } else - return -EINVAL; - - spin_lock(cidlock); - psidid = sid_rb_search(cidtree, cid); - - if (!psidid) { /* node does not exist, allocate one & attempt adding */ - spin_unlock(cidlock); - npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL); - if (!npsidid) - return -ENOMEM; - - npsidid->sidstr = kmalloc(SID_STRING_MAX, GFP_KERNEL); - if (!npsidid->sidstr) { - kfree(npsidid); - return -ENOMEM; - } - - spin_lock(cidlock); - psidid = sid_rb_search(cidtree, cid); - if (psidid) { /* node happened to get inserted meanwhile */ - ++psidid->refcount; - spin_unlock(cidlock); - kfree(npsidid->sidstr); - kfree(npsidid); - } else { - psidid = npsidid; - sid_rb_insert(cidtree, cid, &psidid, - sidtype == SIDOWNER ? "oi:" : "gi:"); - ++psidid->refcount; - spin_unlock(cidlock); - } - } else { - ++psidid->refcount; - spin_unlock(cidlock); - } - /* - * If we are here, it is safe to access psidid and its fields - * since a reference was taken earlier while holding the spinlock. - * A reference on the node is put without holding the spinlock - * and it is OK to do so in this case, shrinker will not erase - * this node until all references are put and we do not access - * any fields of the node after a reference is put . - */ - if (test_bit(SID_ID_MAPPED, &psidid->state)) { - cifs_copy_sid(ssid, &psidid->sid); - psidid->time = jiffies; /* update ts for accessing */ - goto id_sid_out; - } + rc = snprintf(desc, sizeof(desc), "%ci:%u", + sidtype == SIDOWNER ? 
'o' : 'g', cid); + if (rc >= sizeof(desc)) + return -EINVAL; - if (time_after(psidid->time + SID_MAP_RETRY, jiffies)) { + rc = 0; + saved_cred = override_creds(root_cred); + sidkey = request_key(&cifs_idmap_key_type, desc, ""); + if (IS_ERR(sidkey)) { rc = -EINVAL; - goto id_sid_out; - } - - if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) { - saved_cred = override_creds(root_cred); - sidkey = request_key(&cifs_idmap_key_type, psidid->sidstr, ""); - if (IS_ERR(sidkey)) { - rc = -EINVAL; - cFYI(1, "%s: Can't map and id to a SID", __func__); - } else if (sidkey->datalen < CIFS_SID_BASE_SIZE) { - rc = -EIO; - cFYI(1, "%s: Downcall contained malformed key " - "(datalen=%hu)", __func__, sidkey->datalen); - } else { - lsid = (struct cifs_sid *)sidkey->payload.data; - cifs_copy_sid(&psidid->sid, lsid); - cifs_copy_sid(ssid, &psidid->sid); - set_bit(SID_ID_MAPPED, &psidid->state); - key_put(sidkey); - kfree(psidid->sidstr); - } - psidid->time = jiffies; /* update ts for accessing */ - revert_creds(saved_cred); - clear_bit(SID_ID_PENDING, &psidid->state); - wake_up_bit(&psidid->state, SID_ID_PENDING); - } else { - rc = wait_on_bit(&psidid->state, SID_ID_PENDING, - sidid_pending_wait, TASK_INTERRUPTIBLE); - if (rc) { - cFYI(1, "%s: sidid_pending_wait interrupted %d", - __func__, rc); - --psidid->refcount; - return rc; - } - if (test_bit(SID_ID_MAPPED, &psidid->state)) - cifs_copy_sid(ssid, &psidid->sid); - else - rc = -EINVAL; - } -id_sid_out: - --psidid->refcount; + cFYI(1, "%s: Can't map %cid %u to a SID", __func__, + sidtype == SIDOWNER ? 'u' : 'g', cid); + goto out_revert_creds; + } else if (sidkey->datalen < CIFS_SID_BASE_SIZE) { + rc = -EIO; + cFYI(1, "%s: Downcall contained malformed key " + "(datalen=%hu)", __func__, sidkey->datalen); + goto out_key_put; + } + cifs_copy_sid(ssid, (struct cifs_sid *)sidkey->payload.data); +out_key_put: + key_put(sidkey); +out_revert_creds: + revert_creds(saved_cred); return rc; } @@ -470,111 +211,66 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, struct cifs_fattr *fattr, uint sidtype) { int rc; - unsigned long cid; - struct key *idkey; + struct key *sidkey; + char *sidstr; const struct cred *saved_cred; - struct cifs_sid_id *psidid, *npsidid; - struct rb_root *cidtree; - spinlock_t *cidlock; - - if (sidtype == SIDOWNER) { - cid = cifs_sb->mnt_uid; /* default uid, in case upcall fails */ - cidlock = &siduidlock; - cidtree = &uidtree; - } else if (sidtype == SIDGROUP) { - cid = cifs_sb->mnt_gid; /* default gid, in case upcall fails */ - cidlock = &sidgidlock; - cidtree = &gidtree; - } else - return -ENOENT; - - spin_lock(cidlock); - psidid = id_rb_search(cidtree, psid); - - if (!psidid) { /* node does not exist, allocate one & attempt adding */ - spin_unlock(cidlock); - npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL); - if (!npsidid) - return -ENOMEM; - - npsidid->sidstr = kmalloc(SID_STRING_MAX, GFP_KERNEL); - if (!npsidid->sidstr) { - kfree(npsidid); - return -ENOMEM; - } - - spin_lock(cidlock); - psidid = id_rb_search(cidtree, psid); - if (psidid) { /* node happened to get inserted meanwhile */ - ++psidid->refcount; - spin_unlock(cidlock); - kfree(npsidid->sidstr); - kfree(npsidid); - } else { - psidid = npsidid; - id_rb_insert(cidtree, psid, &psidid, - sidtype == SIDOWNER ? 
"os:" : "gs:"); - ++psidid->refcount; - spin_unlock(cidlock); - } - } else { - ++psidid->refcount; - spin_unlock(cidlock); - } + uid_t fuid = cifs_sb->mnt_uid; + gid_t fgid = cifs_sb->mnt_gid; /* - * If we are here, it is safe to access psidid and its fields - * since a reference was taken earlier while holding the spinlock. - * A reference on the node is put without holding the spinlock - * and it is OK to do so in this case, shrinker will not erase - * this node until all references are put and we do not access - * any fields of the node after a reference is put . + * If we have too many subauthorities, then something is really wrong. + * Just return an error. */ - if (test_bit(SID_ID_MAPPED, &psidid->state)) { - cid = psidid->id; - psidid->time = jiffies; /* update ts for accessing */ - goto sid_to_id_out; + if (unlikely(psid->num_subauth > SID_MAX_SUB_AUTHORITIES)) { + cFYI(1, "%s: %u subauthorities is too many!", __func__, + psid->num_subauth); + return -EIO; } - if (time_after(psidid->time + SID_MAP_RETRY, jiffies)) - goto sid_to_id_out; - - if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) { - saved_cred = override_creds(root_cred); - idkey = request_key(&cifs_idmap_key_type, psidid->sidstr, ""); - if (IS_ERR(idkey)) - cFYI(1, "%s: Can't map SID to an id", __func__); - else { - cid = *(unsigned long *)idkey->payload.value; - psidid->id = cid; - set_bit(SID_ID_MAPPED, &psidid->state); - key_put(idkey); - kfree(psidid->sidstr); - } - revert_creds(saved_cred); - psidid->time = jiffies; /* update ts for accessing */ - clear_bit(SID_ID_PENDING, &psidid->state); - wake_up_bit(&psidid->state, SID_ID_PENDING); - } else { - rc = wait_on_bit(&psidid->state, SID_ID_PENDING, - sidid_pending_wait, TASK_INTERRUPTIBLE); - if (rc) { - cFYI(1, "%s: sidid_pending_wait interrupted %d", - __func__, rc); - --psidid->refcount; /* decremented without spinlock */ - return rc; - } - if (test_bit(SID_ID_MAPPED, &psidid->state)) - cid = psidid->id; + sidstr = sid_to_key_str(psid, sidtype); + if (!sidstr) + return -ENOMEM; + + saved_cred = override_creds(root_cred); + sidkey = request_key(&cifs_idmap_key_type, sidstr, ""); + if (IS_ERR(sidkey)) { + rc = -EINVAL; + cFYI(1, "%s: Can't map SID %s to a %cid", __func__, sidstr, + sidtype == SIDOWNER ? 'u' : 'g'); + goto out_revert_creds; + } + + /* + * FIXME: Here we assume that uid_t and gid_t are same size. It's + * probably a safe assumption but might be better to check based on + * sidtype. + */ + if (sidkey->datalen < sizeof(uid_t)) { + rc = -EIO; + cFYI(1, "%s: Downcall contained malformed key " + "(datalen=%hu)", __func__, sidkey->datalen); + goto out_key_put; } -sid_to_id_out: - --psidid->refcount; /* decremented without spinlock */ if (sidtype == SIDOWNER) - fattr->cf_uid = cid; + fuid = *(uid_t *)sidkey->payload.value; else - fattr->cf_gid = cid; + fgid = *(gid_t *)sidkey->payload.value; +out_key_put: + key_put(sidkey); +out_revert_creds: + revert_creds(saved_cred); + kfree(sidstr); + + /* + * Note that we return 0 here unconditionally. If the mapping + * fails then we just fall back to using the mnt_uid/mnt_gid. 
+ */ + if (sidtype == SIDOWNER) + fattr->cf_uid = fuid; + else + fattr->cf_gid = fgid; return 0; } @@ -621,17 +317,6 @@ init_cifs_idmap(void) cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; root_cred = cred; - spin_lock_init(&siduidlock); - uidtree = RB_ROOT; - spin_lock_init(&sidgidlock); - gidtree = RB_ROOT; - - spin_lock_init(&uidsidlock); - siduidtree = RB_ROOT; - spin_lock_init(&gidsidlock); - sidgidtree = RB_ROOT; - register_shrinker(&cifs_shrinker); - cFYI(1, "cifs idmap keyring: %d", key_serial(keyring)); return 0; @@ -648,41 +333,9 @@ exit_cifs_idmap(void) key_revoke(root_cred->thread_keyring); unregister_key_type(&cifs_idmap_key_type); put_cred(root_cred); - unregister_shrinker(&cifs_shrinker); cFYI(1, "Unregistered %s key type", cifs_idmap_key_type.name); } -void -cifs_destroy_idmaptrees(void) -{ - struct rb_root *root; - struct rb_node *node; - - root = &uidtree; - spin_lock(&siduidlock); - while ((node = rb_first(root))) - rb_erase(node, root); - spin_unlock(&siduidlock); - - root = &gidtree; - spin_lock(&sidgidlock); - while ((node = rb_first(root))) - rb_erase(node, root); - spin_unlock(&sidgidlock); - - root = &siduidtree; - spin_lock(&uidsidlock); - while ((node = rb_first(root))) - rb_erase(node, root); - spin_unlock(&uidsidlock); - - root = &sidgidtree; - spin_lock(&gidsidlock); - while ((node = rb_first(root))) - rb_erase(node, root); - spin_unlock(&gidsidlock); -} - /* copy ntsd, owner sid, and group sid from a security descriptor to another */ static void copy_sec_desc(const struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, __u32 sidsoffset) diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h index 249c94f39635..46cd444ea2f2 100644 --- a/fs/cifs/cifsacl.h +++ b/fs/cifs/cifsacl.h @@ -23,7 +23,7 @@ #define _CIFSACL_H -#define NUM_AUTHS 6 /* number of authority fields */ +#define NUM_AUTHS (6) /* number of authority fields */ #define SID_MAX_SUB_AUTHORITIES (15) /* max number of sub authority fields */ #define NUM_WK_SIDS 7 /* number of well known sids */ #define SIDNAMELENGTH 20 /* long enough for the ones we care about */ @@ -51,15 +51,12 @@ * u32: max 10 bytes in decimal * * "S-" + 3 bytes for version field + 4 bytes for each authority field (3 bytes - * per number + 1 for '-') + 11 bytes for each subauthority field (10 bytes * per number + 1 for '-') + NULL terminator. 
+ * + * Add 11 bytes for each subauthority field (10 bytes each + 1 for '-') */ -#define SID_STRING_MAX (195) - -#define SID_ID_MAPPED 0 -#define SID_ID_PENDING 1 -#define SID_MAP_EXPIRE (3600 * HZ) /* map entry expires after one hour */ -#define SID_MAP_RETRY (300 * HZ) /* wait 5 minutes for next attempt to map */ +#define SID_STRING_BASE_SIZE (2 + 3 + (4 * NUM_AUTHS) + 1) +#define SID_STRING_SUBAUTH_SIZE (11) /* size of a single subauth string */ struct cifs_ntsd { __le16 revision; /* revision level */ @@ -94,19 +91,4 @@ struct cifs_ace { struct cifs_sid sid; /* ie UUID of user or group who gets these perms */ } __attribute__((packed)); -struct cifs_wksid { - struct cifs_sid cifssid; - char sidname[SIDNAMELENGTH]; -} __attribute__((packed)); - -struct cifs_sid_id { - unsigned int refcount; /* increment with spinlock, decrement without */ - unsigned long id; - unsigned long time; - unsigned long state; - char *sidstr; - struct rb_node rbnode; - struct cifs_sid sid; -}; - #endif /* _CIFSACL_H */ diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 273b34904d5b..c6e32f22fbd3 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1204,7 +1204,6 @@ exit_cifs(void) unregister_filesystem(&cifs_fs_type); cifs_dfs_release_automount_timer(); #ifdef CONFIG_CIFS_ACL - cifs_destroy_idmaptrees(); exit_cifs_idmap(); #endif #ifdef CONFIG_CIFS_UPCALL diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index a152f3645b09..1988c1baa224 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -58,7 +58,6 @@ do { \ } while (0) extern int init_cifs_idmap(void); extern void exit_cifs_idmap(void); -extern void cifs_destroy_idmaptrees(void); extern char *build_path_from_dentry(struct dentry *); extern char *cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, -- cgit v1.2.1 From 41a9f1f6b38664fc08431674d87871a57d763be1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 3 Dec 2012 06:05:29 -0500 Subject: cifs: avoid extra allocation for small cifs.idmap keys The cifs.idmap keytype always allocates memory to hold the payload from userspace. In the common case where we're translating a SID to a UID or GID, we're allocating memory to hold something that's less than or equal to the size of a pointer. When the payload is the same size as a pointer or smaller, just store it in the payload.value union member instead. That saves us an extra allocation on the sid_to_id upcall. Note that we have to take extra care to check the datalen when we go to dereference the .data pointer in the union, but the callers now check that as a matter of course anyway. Reviewed-by: Shirish Pargaonkar Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index f4508ee4e80d..751d34bd825c 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -49,6 +49,20 @@ cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep) { char *payload; + /* + * If the payload is less than or equal to the size of a pointer, then + * an allocation here is wasteful. Just copy the data directly to the + * payload.value union member instead. + * + * With this however, you must check the datalen before trying to + * dereference payload.data! 
+ */ + if (prep->datalen <= sizeof(void *)) { + key->payload.value = 0; + memcpy(&key->payload.value, prep->data, prep->datalen); + key->datalen = prep->datalen; + return 0; + } payload = kmalloc(prep->datalen, GFP_KERNEL); if (!payload) return -ENOMEM; @@ -62,7 +76,8 @@ cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep) static inline void cifs_idmap_key_destroy(struct key *key) { - kfree(key->payload.data); + if (key->datalen > sizeof(void *)) + kfree(key->payload.data); } static struct key_type cifs_idmap_key_type = { @@ -245,7 +260,7 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, * probably a safe assumption but might be better to check based on * sidtype. */ - if (sidkey->datalen < sizeof(uid_t)) { + if (sidkey->datalen != sizeof(uid_t)) { rc = -EIO; cFYI(1, "%s: Downcall contained malformed key " "(datalen=%hu)", __func__, sidkey->datalen); @@ -253,9 +268,9 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, } if (sidtype == SIDOWNER) - fuid = *(uid_t *)sidkey->payload.value; + memcpy(&fuid, &sidkey->payload.value, sizeof(uid_t)); else - fgid = *(gid_t *)sidkey->payload.value; + memcpy(&fgid, &sidkey->payload.value, sizeof(gid_t)); out_key_put: key_put(sidkey); -- cgit v1.2.1 From 2ae03025d520de581fd1c58e98bbf3045c0f4695 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 3 Dec 2012 06:05:30 -0500 Subject: cifs: extra sanity checking for cifs.idmap keys Now that we aren't so rigid about the length of the key being passed in, we need to be a bit more rigorous about checking the length of the actual data against the claimed length (a'la num_subauths field). Check for the case where userspace sends us a seemingly valid key with a num_subauths field that goes beyond the end of the array. If that happens, return -EIO and invalidate the key. Also change the other places where we check for malformed keys in this code to invalidate the key as well. 
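In outline, the new check in id_to_sid() computes how large the SID claims to be and rejects (and invalidates) any key whose payload cannot back that claim:

    ksid = (struct cifs_sid *)sidkey->payload.data;
    ksid_size = CIFS_SID_BASE_SIZE + (ksid->num_subauth * sizeof(__le32));
    if (ksid_size > sidkey->datalen) {
            key_invalidate(sidkey);  /* don't let the bad key linger */
            rc = -EIO;               /* sketch; the real code uses gotos */
    }

Invalidating rather than merely erroring out matters because a malformed key would otherwise stay cached in the keyring and poison every later lookup until it expired.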
Reviewed-by: Shirish Pargaonkar Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 751d34bd825c..b0b114acdece 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -191,6 +191,8 @@ id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid) { int rc; struct key *sidkey; + struct cifs_sid *ksid; + unsigned int ksid_size; char desc[3 + 10 + 1]; /* 3 byte prefix + 10 bytes for value + NULL */ const struct cred *saved_cred; @@ -211,14 +213,27 @@ id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid) rc = -EIO; cFYI(1, "%s: Downcall contained malformed key " "(datalen=%hu)", __func__, sidkey->datalen); - goto out_key_put; + goto invalidate_key; } - cifs_copy_sid(ssid, (struct cifs_sid *)sidkey->payload.data); + + ksid = (struct cifs_sid *)sidkey->payload.data; + ksid_size = CIFS_SID_BASE_SIZE + (ksid->num_subauth * sizeof(__le32)); + if (ksid_size > sidkey->datalen) { + rc = -EIO; + cFYI(1, "%s: Downcall contained malformed key (datalen=%hu, " + "ksid_size=%u)", __func__, sidkey->datalen, ksid_size); + goto invalidate_key; + } + cifs_copy_sid(ssid, ksid); out_key_put: key_put(sidkey); out_revert_creds: revert_creds(saved_cred); return rc; + +invalidate_key: + key_invalidate(sidkey); + goto out_key_put; } static int @@ -264,6 +279,7 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, rc = -EIO; cFYI(1, "%s: Downcall contained malformed key " "(datalen=%hu)", __func__, sidkey->datalen); + key_invalidate(sidkey); goto out_key_put; } -- cgit v1.2.1 From 7ee0b4c635c091eb3c805977ba886bae2fd33f0c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 3 Dec 2012 06:05:31 -0500 Subject: cifs: fix hardcoded default security descriptor length It was hardcoded to 192 bytes, which was not enough when the max number of subauthorities went to 15. Redefine this constant in terms of sizeof the structs involved, and rename it for better clarity. While we're at it, remove a couple more unused constants from cifsacl.h. 
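To see why 192 bytes no longer suffice, assume the usual packed layouts (a
cifs_sid carrying a 1-byte revision, a 1-byte num_subauth, NUM_AUTHS = 6
authority bytes and up to SID_MAX_SUB_AUTHORITIES = 15 __le32
subauthorities); the worst case then works out to:

	sizeof(struct cifs_sid)		/* 1 + 1 + 6 + (15 * 4) = 68 */
	sizeof(struct cifs_ace)		/* 1 + 1 + 2 + 4 + 68   = 76 */
	3 * sizeof(struct cifs_ace)	/* 228 > 192 */

Three maximal ACEs alone overflow the old constant before the cifs_ntsd
and cifs_acl headers are even counted, hence deriving the default from
sizeof in the diff below.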
Reviewed-by: Shirish Pargaonkar Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 2 +- fs/cifs/cifsacl.h | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index b0b114acdece..08b4d5022686 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1008,7 +1008,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, * memory for the smb header, set security descriptor request security * descriptor parameters, and secuirty descriptor itself */ - secdesclen = max_t(u32, secdesclen, DEFSECDESCLEN); + secdesclen = max_t(u32, secdesclen, DEFAULT_SEC_DESC_LEN); pnntsd = kmalloc(secdesclen, GFP_KERNEL); if (!pnntsd) { cERROR(1, "Unable to allocate security descriptor"); diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h index 46cd444ea2f2..a445405f80d0 100644 --- a/fs/cifs/cifsacl.h +++ b/fs/cifs/cifsacl.h @@ -25,9 +25,6 @@ #define NUM_AUTHS (6) /* number of authority fields */ #define SID_MAX_SUB_AUTHORITIES (15) /* max number of sub authority fields */ -#define NUM_WK_SIDS 7 /* number of well known sids */ -#define SIDNAMELENGTH 20 /* long enough for the ones we care about */ -#define DEFSECDESCLEN 192 /* sec desc len contaiting a dacl with three aces */ #define READ_BIT 0x4 #define WRITE_BIT 0x2 @@ -42,6 +39,14 @@ #define SIDOWNER 1 #define SIDGROUP 2 +/* + * Security Descriptor length containing DACL with 3 ACEs (one each for + * owner, group and world). + */ +#define DEFAULT_SEC_DESC_LEN (sizeof(struct cifs_ntsd) + \ + sizeof(struct cifs_acl) + \ + (sizeof(struct cifs_ace) * 3)) + /* * Maximum size of a string representation of a SID: * -- cgit v1.2.1 From 1f6306806c1494bea51b93f96e105e93a96e3c22 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 3 Dec 2012 06:05:31 -0500 Subject: cifs: deal with id_to_sid embedded sid reply corner case A SID could potentially be embedded inside of payload.value if there are no subauthorities, and the arch has 8 byte pointers. Allow for that possibility there. While we're at it, rephrase the "embedding" check in terms of key->payload to allow for the possibility that the union might change size in the future. Reviewed-by: Shirish Pargaonkar Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 08b4d5022686..8dd9212ffef5 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -57,7 +57,7 @@ cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep) * With this however, you must check the datalen before trying to * dereference payload.data! */ - if (prep->datalen <= sizeof(void *)) { + if (prep->datalen <= sizeof(key->payload)) { key->payload.value = 0; memcpy(&key->payload.value, prep->data, prep->datalen); key->datalen = prep->datalen; @@ -76,7 +76,7 @@ cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep) static inline void cifs_idmap_key_destroy(struct key *key) { - if (key->datalen > sizeof(void *)) + if (key->datalen > sizeof(key->payload)) kfree(key->payload.data); } @@ -216,7 +216,15 @@ id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid) goto invalidate_key; } - ksid = (struct cifs_sid *)sidkey->payload.data; + /* + * A sid is usually too large to be embedded in payload.value, but if + * there are no subauthorities and the host has 8-byte pointers, then + * it could be. 
+ */ + ksid = sidkey->datalen <= sizeof(sidkey->payload) ? + (struct cifs_sid *)&sidkey->payload.value : + (struct cifs_sid *)sidkey->payload.data; + ksid_size = CIFS_SID_BASE_SIZE + (ksid->num_subauth * sizeof(__le32)); if (ksid_size > sidkey->datalen) { rc = -EIO; @@ -224,6 +232,7 @@ id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid) "ksid_size=%u)", __func__, sidkey->datalen, ksid_size); goto invalidate_key; } + cifs_copy_sid(ssid, ksid); out_key_put: key_put(sidkey); -- cgit v1.2.1 From 38107d45cf452761a74fe512190e23f36834d6dd Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 8 Dec 2012 22:08:06 -0600 Subject: Do not send SMB2 signatures for SMB3 frames Restructure code to make SMB2 vs. SMB3 signing a protocol specific op. SMB3 signing (AES_CMAC) is not enabled yet, but this restructuring at least makes sure we don't send an smb2 signature on an smb3 signed connection. A followon patch will add AES_CMAC and enable smb3 signing. Signed-off-by: Steve French Acked-by: Jeff Layton --- fs/cifs/cifsglob.h | 4 ++- fs/cifs/connect.c | 2 +- fs/cifs/smb2ops.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/cifs/smb2proto.h | 4 +++ fs/cifs/smb2transport.c | 13 +++++++--- 5 files changed, 86 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 74a07b604ffd..dfab450a191e 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -367,6 +367,8 @@ struct smb_version_operations { void (*set_lease_key)(struct inode *, struct cifs_fid *fid); /* generate new lease key */ void (*new_lease_key)(struct cifs_fid *fid); + int (*calc_signature)(struct smb_rqst *rqst, + struct TCP_Server_Info *server); }; struct smb_version_values { @@ -1489,6 +1491,6 @@ extern struct smb_version_values smb20_values; extern struct smb_version_operations smb21_operations; extern struct smb_version_values smb21_values; #define SMB30_VERSION_STRING "3.0" -/*extern struct smb_version_operations smb30_operations; */ /* not needed yet */ +extern struct smb_version_operations smb30_operations; extern struct smb_version_values smb30_values; #endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 290c13442f75..f3276239e075 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1085,7 +1085,7 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol) vol->vals = &smb21_values; break; case Smb_30: - vol->ops = &smb21_operations; /* currently identical with 2.1 */ + vol->ops = &smb30_operations; vol->vals = &smb30_values; break; #endif diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index ad4d96a4bff5..d79de7bc4435 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -623,6 +623,74 @@ struct smb_version_operations smb21_operations = { .get_lease_key = smb2_get_lease_key, .set_lease_key = smb2_set_lease_key, .new_lease_key = smb2_new_lease_key, + .calc_signature = smb2_calc_signature, +}; + + +struct smb_version_operations smb30_operations = { + .compare_fids = smb2_compare_fids, + .setup_request = smb2_setup_request, + .setup_async_request = smb2_setup_async_request, + .check_receive = smb2_check_receive, + .add_credits = smb2_add_credits, + .set_credits = smb2_set_credits, + .get_credits_field = smb2_get_credits_field, + .get_credits = smb2_get_credits, + .get_next_mid = smb2_get_next_mid, + .read_data_offset = smb2_read_data_offset, + .read_data_length = smb2_read_data_length, + .map_error = map_smb2_to_linux_error, + .find_mid = smb2_find_mid, + .check_message = smb2_check_message, + .dump_detail = 
smb2_dump_detail, + .clear_stats = smb2_clear_stats, + .print_stats = smb2_print_stats, + .is_oplock_break = smb2_is_valid_oplock_break, + .need_neg = smb2_need_neg, + .negotiate = smb2_negotiate, + .negotiate_wsize = smb2_negotiate_wsize, + .negotiate_rsize = smb2_negotiate_rsize, + .sess_setup = SMB2_sess_setup, + .logoff = SMB2_logoff, + .tree_connect = SMB2_tcon, + .tree_disconnect = SMB2_tdis, + .is_path_accessible = smb2_is_path_accessible, + .can_echo = smb2_can_echo, + .echo = SMB2_echo, + .query_path_info = smb2_query_path_info, + .get_srv_inum = smb2_get_srv_inum, + .query_file_info = smb2_query_file_info, + .set_path_size = smb2_set_path_size, + .set_file_size = smb2_set_file_size, + .set_file_info = smb2_set_file_info, + .mkdir = smb2_mkdir, + .mkdir_setinfo = smb2_mkdir_setinfo, + .rmdir = smb2_rmdir, + .unlink = smb2_unlink, + .rename = smb2_rename_path, + .create_hardlink = smb2_create_hardlink, + .open = smb2_open_file, + .set_fid = smb2_set_fid, + .close = smb2_close_file, + .flush = smb2_flush_file, + .async_readv = smb2_async_readv, + .async_writev = smb2_async_writev, + .sync_read = smb2_sync_read, + .sync_write = smb2_sync_write, + .query_dir_first = smb2_query_dir_first, + .query_dir_next = smb2_query_dir_next, + .close_dir = smb2_close_dir, + .calc_smb_size = smb2_calc_size, + .is_status_pending = smb2_is_status_pending, + .oplock_response = smb2_oplock_response, + .queryfs = smb2_queryfs, + .mand_lock = smb2_mand_lock, + .mand_unlock_range = smb2_unlock_range, + .push_mand_locks = smb2_push_mandatory_locks, + .get_lease_key = smb2_get_lease_key, + .set_lease_key = smb2_set_lease_key, + .new_lease_key = smb2_new_lease_key, + .calc_signature = smb3_calc_signature, }; struct smb_version_values smb20_values = { diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 7d25f8b14f93..2aa3535e38ce 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -47,6 +47,10 @@ extern struct mid_q_entry *smb2_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst); extern struct mid_q_entry *smb2_setup_async_request( struct TCP_Server_Info *server, struct smb_rqst *rqst); +extern int smb2_calc_signature(struct smb_rqst *rqst, + struct TCP_Server_Info *server); +extern int smb3_calc_signature(struct smb_rqst *rqst, + struct TCP_Server_Info *server); extern void smb2_echo_request(struct work_struct *work); extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode); extern __u8 smb2_map_lease_to_oplock(__le32 lease_state); diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 2a5fdf26f79f..8dd73e61d762 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -39,7 +39,7 @@ #include "smb2status.h" #include "smb2glob.h" -static int +int smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) { int i, rc; @@ -116,6 +116,13 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) return rc; } +int +smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) +{ + cFYI(1, "smb3 signatures not supported yet"); + return -EOPNOTSUPP; +} + /* must be called with server->srv_mutex held */ static int smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server) @@ -132,7 +139,7 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server) return rc; } - rc = smb2_calc_signature(rqst, server); + rc = server->ops->calc_signature(rqst, server); return rc; } @@ -168,7 +175,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) memset(smb2_pdu->Signature, 0, 
SMB2_SIGNATURE_SIZE); mutex_lock(&server->srv_mutex); - rc = smb2_calc_signature(rqst, server); + rc = server->ops->calc_signature(rqst, server); mutex_unlock(&server->srv_mutex); if (rc) -- cgit v1.2.1 From 6d8b59d712e95d257ee16f80b579677e5e1bf33c Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 8 Dec 2012 22:36:29 -0600 Subject: fix "disabling echoes and oplocks" on SMB2 mounts SMB2 and later will return only 1 credit for session setup (phase 1), not just for the negotiate protocol response. Do not disable echoes and oplocks on session setup (we only need one credit for tree connection anyway) as a response with only 1 credit on phase 1 of session setup is expected. Fixes the "CIFS VFS: disabling echoes and oplocks" message logged to dmesg. Signed-off-by: Steve French Acked-by: Jeff Layton --- fs/cifs/smb2pdu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index e7f9dbc33ce2..41d9d0725f0f 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -612,7 +612,8 @@ ssetup_ntlmssp_authenticate: /* BB add code to build os and lm fields */ - rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, CIFS_LOG_ERROR); + rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, + CIFS_LOG_ERROR | CIFS_NEG_OP); kfree(security_blob); rsp = (struct smb2_sess_setup_rsp *)iov[0].iov_base; -- cgit v1.2.1 From 67cf5b09a46f72e048501b84996f2f77bc42e947 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:04:46 -0500 Subject: ext4: add the basic function for inline data support Implement inline data with xattr. Now we use "system.data" to store xattr, and the xattr will be extended if the i_size is increased while we don't release the space during truncate. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/Makefile | 2 +- fs/ext4/ext4.h | 10 +- fs/ext4/inline.c | 466 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/inode.c | 5 +- fs/ext4/xattr.h | 54 +++++++ 5 files changed, 534 insertions(+), 3 deletions(-) create mode 100644 fs/ext4/inline.c (limited to 'fs') diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 41f22be2ffa4..3d96d5698538 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -9,6 +9,6 @@ ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ mmp.o indirect.o extents_status.o -ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o +ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o inline.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2e9ffa9100bb..c827e47d556c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -402,6 +402,7 @@ struct flex_groups { #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ #define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ #define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ +#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ #define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ @@ -458,6 +459,7 @@ enum { EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ + EXT4_INODE_INLINE_DATA = 28, /* Data in inode.
*/ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ }; @@ -504,6 +506,7 @@ static inline void ext4_check_flag_values(void) CHECK_FLAG_VALUE(EXTENTS); CHECK_FLAG_VALUE(EA_INODE); CHECK_FLAG_VALUE(EOFBLOCKS); + CHECK_FLAG_VALUE(INLINE_DATA); CHECK_FLAG_VALUE(RESERVED); } @@ -918,6 +921,10 @@ struct ext4_inode_info { /* on-disk additional length */ __u16 i_extra_isize; + /* Indicate the inline data space. */ + u16 i_inline_off; + u16 i_inline_size; + #ifdef CONFIG_QUOTA /* quota space reservation, managed internally by quota code */ qsize_t i_reserved_quota; @@ -1376,6 +1383,7 @@ enum { EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */ EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read nolocking */ + EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ @@ -1497,7 +1505,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ #define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */ #define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ -#define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x8000 /* data in inode */ +#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c new file mode 100644 index 000000000000..bec68b364832 --- /dev/null +++ b/fs/ext4/inline.c @@ -0,0 +1,466 @@ +/* + * Copyright (c) 2012 Taobao. + * Written by Tao Ma + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" + +#define EXT4_XATTR_SYSTEM_DATA "data" +#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) + +int ext4_get_inline_size(struct inode *inode) +{ + if (EXT4_I(inode)->i_inline_off) + return EXT4_I(inode)->i_inline_size; + + return 0; +} + +static int get_max_inline_xattr_value_size(struct inode *inode, + struct ext4_iloc *iloc) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_entry *entry; + struct ext4_inode *raw_inode; + int free, min_offs; + + min_offs = EXT4_SB(inode->i_sb)->s_inode_size - + EXT4_GOOD_OLD_INODE_SIZE - + EXT4_I(inode)->i_extra_isize - + sizeof(struct ext4_xattr_ibody_header); + + /* + * We need to subtract another sizeof(__u32) since an in-inode xattr + * needs an empty 4 bytes to indicate the gap between the xattr entry + * and the name/value pair. + */ + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) + return EXT4_XATTR_SIZE(min_offs - + EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)) - + EXT4_XATTR_ROUND - sizeof(__u32)); + + raw_inode = ext4_raw_inode(iloc); + header = IHDR(inode, raw_inode); + entry = IFIRST(header); + + /* Compute min_offs. 
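+ * (i.e. the smallest e_value_offs among the xattr values already stored
+ * in the inode body; the free space we can still hand out sits below it)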
*/ + for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { + if (!entry->e_value_block && entry->e_value_size) { + size_t offs = le16_to_cpu(entry->e_value_offs); + if (offs < min_offs) + min_offs = offs; + } + } + free = min_offs - + ((void *)entry - (void *)IFIRST(header)) - sizeof(__u32); + + if (EXT4_I(inode)->i_inline_off) { + entry = (struct ext4_xattr_entry *) + ((void *)raw_inode + EXT4_I(inode)->i_inline_off); + + free += le32_to_cpu(entry->e_value_size); + goto out; + } + + free -= EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)); + + if (free > EXT4_XATTR_ROUND) + free = EXT4_XATTR_SIZE(free - EXT4_XATTR_ROUND); + else + free = 0; + +out: + return free; +} + +/* + * Get the maximum size we now can store in an inode. + * If we can't find the space for a xattr entry, don't use the space + * of the extents since we have no space to indicate the inline data. + */ +int ext4_get_max_inline_size(struct inode *inode) +{ + int error, max_inline_size; + struct ext4_iloc iloc; + + if (EXT4_I(inode)->i_extra_isize == 0) + return 0; + + error = ext4_get_inode_loc(inode, &iloc); + if (error) { + ext4_error_inode(inode, __func__, __LINE__, 0, + "can't get inode location %lu", + inode->i_ino); + return 0; + } + + down_read(&EXT4_I(inode)->xattr_sem); + max_inline_size = get_max_inline_xattr_value_size(inode, &iloc); + up_read(&EXT4_I(inode)->xattr_sem); + + brelse(iloc.bh); + + if (!max_inline_size) + return 0; + + return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE; +} + +int ext4_has_inline_data(struct inode *inode) +{ + return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && + EXT4_I(inode)->i_inline_off; +} + +/* + * this function does not take xattr_sem, which is OK because it is + * currently only used in a code path coming form ext4_iget, before + * the new inode has been unlocked + */ +int ext4_find_inline_data_nolock(struct inode *inode) +{ + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + }; + int error; + + if (EXT4_I(inode)->i_extra_isize == 0) + return 0; + + error = ext4_get_inode_loc(inode, &is.iloc); + if (error) + return error; + + error = ext4_xattr_ibody_find(inode, &i, &is); + if (error) + goto out; + + if (!is.s.not_found) { + EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - + (void *)ext4_raw_inode(&is.iloc)); + EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + + le32_to_cpu(is.s.here->e_value_size); + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + } +out: + brelse(is.iloc.bh); + return error; +} + +static int ext4_read_inline_data(struct inode *inode, void *buffer, + unsigned int len, + struct ext4_iloc *iloc) +{ + struct ext4_xattr_entry *entry; + struct ext4_xattr_ibody_header *header; + int cp_len = 0; + struct ext4_inode *raw_inode; + + if (!len) + return 0; + + BUG_ON(len > EXT4_I(inode)->i_inline_size); + + cp_len = len < EXT4_MIN_INLINE_DATA_SIZE ? 
+ len : EXT4_MIN_INLINE_DATA_SIZE; + + raw_inode = ext4_raw_inode(iloc); + memcpy(buffer, (void *)(raw_inode->i_block), cp_len); + + len -= cp_len; + buffer += cp_len; + + if (!len) + goto out; + + header = IHDR(inode, raw_inode); + entry = (struct ext4_xattr_entry *)((void *)raw_inode + + EXT4_I(inode)->i_inline_off); + len = min_t(unsigned int, len, + (unsigned int)le32_to_cpu(entry->e_value_size)); + + memcpy(buffer, + (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs), len); + cp_len += len; + +out: + return cp_len; +} + +/* + * write the buffer to the inline inode. + * If 'create' is set, we don't need to do the extra copy in the xattr + * value since it is already handled by ext4_xattr_ibody_set. That saves + * us one memcpy. + */ +void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, + void *buffer, loff_t pos, unsigned int len) +{ + struct ext4_xattr_entry *entry; + struct ext4_xattr_ibody_header *header; + struct ext4_inode *raw_inode; + int cp_len = 0; + + BUG_ON(!EXT4_I(inode)->i_inline_off); + BUG_ON(pos + len > EXT4_I(inode)->i_inline_size); + + raw_inode = ext4_raw_inode(iloc); + buffer += pos; + + if (pos < EXT4_MIN_INLINE_DATA_SIZE) { + cp_len = pos + len > EXT4_MIN_INLINE_DATA_SIZE ? + EXT4_MIN_INLINE_DATA_SIZE - pos : len; + memcpy((void *)raw_inode->i_block + pos, buffer, cp_len); + + len -= cp_len; + buffer += cp_len; + pos += cp_len; + } + + if (!len) + return; + + pos -= EXT4_MIN_INLINE_DATA_SIZE; + header = IHDR(inode, raw_inode); + entry = (struct ext4_xattr_entry *)((void *)raw_inode + + EXT4_I(inode)->i_inline_off); + + memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos, + buffer, len); +} + +static int ext4_create_inline_data(handle_t *handle, + struct inode *inode, unsigned len) +{ + int error; + void *value = NULL; + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + }; + + error = ext4_get_inode_loc(inode, &is.iloc); + if (error) + return error; + + error = ext4_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto out; + + if (len > EXT4_MIN_INLINE_DATA_SIZE) { + value = (void *)empty_zero_page; + len -= EXT4_MIN_INLINE_DATA_SIZE; + } else { + value = ""; + len = 0; + } + + /* Insert the the xttr entry. */ + i.value = value; + i.value_len = len; + + error = ext4_xattr_ibody_find(inode, &i, &is); + if (error) + goto out; + + BUG_ON(!is.s.not_found); + + error = ext4_xattr_ibody_set(handle, inode, &i, &is); + if (error) { + if (error == -ENOSPC) + ext4_clear_inode_state(inode, + EXT4_STATE_MAY_INLINE_DATA); + goto out; + } + + memset((void *)ext4_raw_inode(&is.iloc)->i_block, + 0, EXT4_MIN_INLINE_DATA_SIZE); + + EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - + (void *)ext4_raw_inode(&is.iloc)); + EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE; + ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); + ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA); + get_bh(is.iloc.bh); + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); + +out: + brelse(is.iloc.bh); + return error; +} + +static int ext4_update_inline_data(handle_t *handle, struct inode *inode, + unsigned int len) +{ + int error; + void *value = NULL; + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + }; + + /* If the old space is ok, write the data directly. 
*/ + if (len <= EXT4_I(inode)->i_inline_size) + return 0; + + error = ext4_get_inode_loc(inode, &is.iloc); + if (error) + return error; + + error = ext4_xattr_ibody_find(inode, &i, &is); + if (error) + goto out; + + BUG_ON(is.s.not_found); + + len -= EXT4_MIN_INLINE_DATA_SIZE; + value = kzalloc(len, GFP_NOFS); + if (!value) + goto out; + + error = ext4_xattr_ibody_get(inode, i.name_index, i.name, + value, len); + if (error == -ENODATA) + goto out; + + error = ext4_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto out; + + /* Update the xttr entry. */ + i.value = value; + i.value_len = len; + + error = ext4_xattr_ibody_set(handle, inode, &i, &is); + if (error) + goto out; + + EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - + (void *)ext4_raw_inode(&is.iloc)); + EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + + le32_to_cpu(is.s.here->e_value_size); + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + get_bh(is.iloc.bh); + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); + +out: + kfree(value); + brelse(is.iloc.bh); + return error; +} + +int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, + unsigned int len) +{ + int ret, size; + struct ext4_inode_info *ei = EXT4_I(inode); + + if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) + return -ENOSPC; + + size = ext4_get_max_inline_size(inode); + if (size < len) + return -ENOSPC; + + down_write(&EXT4_I(inode)->xattr_sem); + + if (ei->i_inline_off) + ret = ext4_update_inline_data(handle, inode, len); + else + ret = ext4_create_inline_data(handle, inode, len); + + up_write(&EXT4_I(inode)->xattr_sem); + + return ret; +} + +static int ext4_destroy_inline_data_nolock(handle_t *handle, + struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_xattr_ibody_find is = { + .s = { .not_found = 0, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + .value = NULL, + .value_len = 0, + }; + int error; + + if (!ei->i_inline_off) + return 0; + + error = ext4_get_inode_loc(inode, &is.iloc); + if (error) + return error; + + error = ext4_xattr_ibody_find(inode, &i, &is); + if (error) + goto out; + + error = ext4_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto out; + + error = ext4_xattr_ibody_set(handle, inode, &i, &is); + if (error) + goto out; + + memset((void *)ext4_raw_inode(&is.iloc)->i_block, + 0, EXT4_MIN_INLINE_DATA_SIZE); + + if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_INCOMPAT_EXTENTS)) { + if (S_ISDIR(inode->i_mode) || + S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) { + ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); + ext4_ext_tree_init(handle, inode); + } + } + ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA); + + get_bh(is.iloc.bh); + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); + + EXT4_I(inode)->i_inline_off = 0; + EXT4_I(inode)->i_inline_size = 0; + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); +out: + brelse(is.iloc.bh); + if (error == -ENODATA) + error = 0; + return error; +} + +int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) +{ + int ret; + + down_write(&EXT4_I(inode)->xattr_sem); + ret = ext4_destroy_inline_data_nolock(handle, inode); + up_write(&EXT4_I(inode)->xattr_sem); + + return ret; +} diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index befa005711a1..e23f114e2cfe 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3706,8 +3706,10 @@ static inline void ext4_iget_extra_inode(struct inode 
*inode, { __le32 *magic = (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; - if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) + if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { ext4_set_inode_state(inode, EXT4_STATE_XATTR); + ext4_find_inline_data_nolock(inode); + } } struct inode *ext4_iget(struct super_block *sb, unsigned long ino) @@ -3780,6 +3782,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ + ei->i_inline_off = 0; ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 40ca7a6f5eec..7ae0d05156e3 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -21,6 +21,7 @@ #define EXT4_XATTR_INDEX_TRUSTED 4 #define EXT4_XATTR_INDEX_LUSTRE 5 #define EXT4_XATTR_INDEX_SECURITY 6 +#define EXT4_XATTR_INDEX_SYSTEM 7 struct ext4_xattr_header { __le32 h_magic; /* magic number for identification */ @@ -125,6 +126,19 @@ extern int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is); +extern int ext4_has_inline_data(struct inode *inode); +extern int ext4_get_inline_size(struct inode *inode); +extern int ext4_get_max_inline_size(struct inode *inode); +extern int ext4_find_inline_data_nolock(struct inode *inode); +extern void ext4_write_inline_data(struct inode *inode, + struct ext4_iloc *iloc, + void *buffer, loff_t pos, + unsigned int len); +extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, + unsigned int len); +extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, + unsigned int len); +extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); # else /* CONFIG_EXT4_FS_XATTR */ static inline int @@ -201,6 +215,46 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, return -EOPNOTSUPP; } +static inline int ext4_find_inline_data_nolock(struct inode *inode) +{ + return 0; +} + +static inline int ext4_has_inline_data(struct inode *inode) +{ + return 0; +} + +static inline int ext4_get_inline_size(struct inode *inode) +{ + return 0; +} + +static inline int ext4_get_max_inline_size(struct inode *inode) +{ + return 0; +} + +static inline void ext4_write_inline_data(struct inode *inode, + struct ext4_iloc *iloc, + void *buffer, loff_t pos, + unsigned int len) +{ + return; +} + +static inline int ext4_init_inline_data(handle_t *handle, + struct inode *inode, + unsigned int len) +{ + return 0; +} + +static inline int ext4_destroy_inline_data(handle_t *handle, + struct inode *inode) +{ + return 0; +} # endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY -- cgit v1.2.1 From 46c7f254543dedcf134ad05091ed2b935a9a597d Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:04:52 -0500 Subject: ext4: add read support for inline data Let readpage and readpages handle the case when we want to read an inlined file. 
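One convention in the diff below is worth spelling out:
ext4_readpage_inline() returns -EAGAIN when the inode turns out not to
hold inline data (for instance after racing with a conversion), and the
caller then falls back to the ordinary block-mapped path. In outline
(tracing elided):

	static int ext4_readpage(struct file *file, struct page *page)
	{
		int ret = -EAGAIN;
		struct inode *inode = page->mapping->host;

		if (ext4_has_inline_data(inode))
			ret = ext4_readpage_inline(inode, page);
		if (ret == -EAGAIN)	/* not (or no longer) inline */
			return mpage_readpage(page, ext4_get_block);
		return ret;
	}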
Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/inline.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/inode.c | 31 +++++++++++++++++++++++++++- fs/ext4/xattr.h | 7 +++++++ 3 files changed, 98 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index bec68b364832..e4a41d5d06db 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -454,6 +454,67 @@ out: return error; } +static int ext4_read_inline_page(struct inode *inode, struct page *page) +{ + void *kaddr; + int ret = 0; + size_t len; + struct ext4_iloc iloc; + + BUG_ON(!PageLocked(page)); + BUG_ON(!ext4_has_inline_data(inode)); + BUG_ON(page->index); + + if (!EXT4_I(inode)->i_inline_off) { + ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.", + inode->i_ino); + goto out; + } + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + goto out; + + len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode)); + kaddr = kmap_atomic(page); + ret = ext4_read_inline_data(inode, kaddr, len, &iloc); + flush_dcache_page(page); + kunmap_atomic(kaddr); + zero_user_segment(page, len, PAGE_CACHE_SIZE); + SetPageUptodate(page); + brelse(iloc.bh); + +out: + return ret; +} + +int ext4_readpage_inline(struct inode *inode, struct page *page) +{ + int ret = 0; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + up_read(&EXT4_I(inode)->xattr_sem); + return -EAGAIN; + } + + /* + * Current inline data can only exist in the 1st page, + * So for all the other pages, just set them uptodate. + */ + if (!page->index) + ret = ext4_read_inline_page(inode, page); + else if (!PageUptodate(page)) { + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); + } + + up_read(&EXT4_I(inode)->xattr_sem); + + unlock_page(page); + return ret >= 0 ? 0 : ret; +} + int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) { int ret; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e23f114e2cfe..1668abf80549 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -649,6 +649,9 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, int ret = 0, started = 0; int dio_credits; + if (ext4_has_inline_data(inode)) + return -ERANGE; + map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; @@ -2687,6 +2690,12 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) journal_t *journal; int err; + /* + * We can get here for an inline file via the FIBMAP ioctl + */ + if (ext4_has_inline_data(inode)) + return 0; + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && test_opt(inode->i_sb, DELALLOC)) { /* @@ -2732,14 +2741,30 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) static int ext4_readpage(struct file *file, struct page *page) { + int ret = -EAGAIN; + struct inode *inode = page->mapping->host; + trace_ext4_readpage(page); - return mpage_readpage(page, ext4_get_block); + + if (ext4_has_inline_data(inode)) + ret = ext4_readpage_inline(inode, page); + + if (ret == -EAGAIN) + return mpage_readpage(page, ext4_get_block); + + return ret; } static int ext4_readpages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { + struct inode *inode = mapping->host; + + /* If the file has inline data, no need to do readpages. 
*/ + if (ext4_has_inline_data(inode)) + return 0; + return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); } @@ -3078,6 +3103,10 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, if (ext4_should_journal_data(inode)) return 0; + /* Let buffer I/O handle the inline data case. */ + if (ext4_has_inline_data(inode)) + return 0; + trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 7ae0d05156e3..646c9b9be8ed 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -139,6 +139,8 @@ extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, unsigned int len); extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); + +extern int ext4_readpage_inline(struct inode *inode, struct page *page); # else /* CONFIG_EXT4_FS_XATTR */ static inline int @@ -255,6 +257,11 @@ static inline int ext4_destroy_inline_data(handle_t *handle, { return 0; } + +static inline int ext4_readpage_inline(struct inode *inode, struct page *page) +{ + return 0; +} # endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY -- cgit v1.2.1 From f19d5870cbf72d4cb2a8e1f749dff97af99b071e Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:05:51 -0500 Subject: ext4: add normal write support for inline data For a normal write case (not journalled write, not delayed allocation), we write to the inline if the file is small and convert it to an extent based file when the write is larger than the max inline size. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 11 +++ fs/ext4/extents.c | 9 ++- fs/ext4/inline.c | 233 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/inode.c | 103 ++++++++++++++---------- fs/ext4/xattr.h | 26 ++++++ 5 files changed, 340 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c827e47d556c..9f4efc6c37ba 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2018,8 +2018,19 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int, int *); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int, int *); +int ext4_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); +int ext4_walk_page_buffers(handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, + struct buffer_head *bh)); +int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh); extern struct inode *ext4_iget(struct super_block *, unsigned long); extern int ext4_write_inode(struct inode *, struct writeback_control *); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 1dc19a7b449f..f2659f51b23d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -42,6 +42,7 @@ #include #include "ext4_jbd2.h" #include "ext4_extents.h" +#include "xattr.h" #include @@ -2310,7 +2311,13 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) { int index; - int depth = ext_depth(inode); + int depth; + + /* If we are converting the inline data, only one is needed here. 
*/ + if (ext4_has_inline_data(inode)) + return 1; + + depth = ext_depth(inode); if (chunk) index = depth * 2; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index e4a41d5d06db..320ff6fe5d8c 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -14,6 +14,7 @@ #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" +#include "truncate.h" #define EXT4_XATTR_SYSTEM_DATA "data" #define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) @@ -515,6 +516,238 @@ int ext4_readpage_inline(struct inode *inode, struct page *page) return ret >= 0 ? 0 : ret; } +static int ext4_convert_inline_data_to_extent(struct address_space *mapping, + struct inode *inode, + unsigned flags) +{ + int ret, needed_blocks; + handle_t *handle = NULL; + int retries = 0, sem_held = 0; + struct page *page = NULL; + unsigned from, to; + struct ext4_iloc iloc; + + if (!ext4_has_inline_data(inode)) { + /* + * clear the flag so that no new write + * will trap here again. + */ + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + return 0; + } + + needed_blocks = ext4_writepage_trans_blocks(inode); + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + +retry: + handle = ext4_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + goto out; + } + + /* We cannot recurse into the filesystem as the transaction is already + * started */ + flags |= AOP_FLAG_NOFS; + + page = grab_cache_page_write_begin(mapping, 0, flags); + if (!page) { + ret = -ENOMEM; + goto out; + } + + down_write(&EXT4_I(inode)->xattr_sem); + sem_held = 1; + /* If some one has already done this for us, just exit. */ + if (!ext4_has_inline_data(inode)) { + ret = 0; + goto out; + } + + from = 0; + to = ext4_get_inline_size(inode); + if (!PageUptodate(page)) { + ret = ext4_read_inline_page(inode, page); + if (ret < 0) + goto out; + } + + ret = ext4_destroy_inline_data_nolock(handle, inode); + if (ret) + goto out; + + if (ext4_should_dioread_nolock(inode)) + ret = __block_write_begin(page, from, to, ext4_get_block_write); + else + ret = __block_write_begin(page, from, to, ext4_get_block); + + if (!ret && ext4_should_journal_data(inode)) { + ret = ext4_walk_page_buffers(handle, page_buffers(page), + from, to, NULL, + do_journal_get_write_access); + } + + if (ret) { + unlock_page(page); + page_cache_release(page); + ext4_orphan_add(handle, inode); + up_write(&EXT4_I(inode)->xattr_sem); + sem_held = 0; + ext4_journal_stop(handle); + handle = NULL; + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might + * still be on the orphan list; we need to + * make sure the inode is removed from the + * orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + + block_commit_write(page, from, to); +out: + if (page) { + unlock_page(page); + page_cache_release(page); + } + if (sem_held) + up_write(&EXT4_I(inode)->xattr_sem); + if (handle) + ext4_journal_stop(handle); + brelse(iloc.bh); + return ret; +} + +/* + * Try to write data in the inode. + * If the inode has inline data, check whether the new write can be + * in the inode also. If not, create the page the handle, move the data + * to the page make it update and let the later codes create extent for it. 
+ */ +int ext4_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep) +{ + int ret; + handle_t *handle; + struct page *page; + struct ext4_iloc iloc; + + if (pos + len > ext4_get_max_inline_size(inode)) + goto convert; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + + /* + * The possible write could happen in the inode, + * so try to reserve the space in inode first. + */ + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + goto out; + } + + ret = ext4_prepare_inline_data(handle, inode, pos + len); + if (ret && ret != -ENOSPC) + goto out; + + /* We don't have space in inline inode, so convert it to extent. */ + if (ret == -ENOSPC) { + ext4_journal_stop(handle); + brelse(iloc.bh); + goto convert; + } + + flags |= AOP_FLAG_NOFS; + + page = grab_cache_page_write_begin(mapping, 0, flags); + if (!page) { + ret = -ENOMEM; + goto out; + } + + *pagep = page; + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + ret = 0; + unlock_page(page); + page_cache_release(page); + goto out_up_read; + } + + if (!PageUptodate(page)) { + ret = ext4_read_inline_page(inode, page); + if (ret < 0) + goto out_up_read; + } + + ret = 1; + handle = NULL; +out_up_read: + up_read(&EXT4_I(inode)->xattr_sem); +out: + if (handle) + ext4_journal_stop(handle); + brelse(iloc.bh); + return ret; +convert: + return ext4_convert_inline_data_to_extent(mapping, + inode, flags); +} + +int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, + unsigned copied, struct page *page) +{ + int ret; + void *kaddr; + struct ext4_iloc iloc; + + if (unlikely(copied < len)) { + if (!PageUptodate(page)) { + copied = 0; + goto out; + } + } + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) { + ext4_std_error(inode->i_sb, ret); + copied = 0; + goto out; + } + + down_write(&EXT4_I(inode)->xattr_sem); + BUG_ON(!ext4_has_inline_data(inode)); + + kaddr = kmap_atomic(page); + ext4_write_inline_data(inode, &iloc, kaddr, pos, len); + kunmap_atomic(kaddr); + SetPageUptodate(page); + /* clear page dirty so that writepages wouldn't work for us. */ + ClearPageDirty(page); + + up_write(&EXT4_I(inode)->xattr_sem); + brelse(iloc.bh); +out: + return copied; +} + + int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) { int ret; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1668abf80549..70c8d5f323f0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -770,13 +770,13 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, return NULL; } -static int walk_page_buffers(handle_t *handle, - struct buffer_head *head, - unsigned from, - unsigned to, - int *partial, - int (*fn)(handle_t *handle, - struct buffer_head *bh)) +int ext4_walk_page_buffers(handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, + struct buffer_head *bh)) { struct buffer_head *bh; unsigned block_start, block_end; @@ -826,8 +826,8 @@ static int walk_page_buffers(handle_t *handle, * is elevated. We'll still have enough credits for the tiny quotafile * write. 
*/ -static int do_journal_get_write_access(handle_t *handle, - struct buffer_head *bh) +int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh) { int dirty = buffer_dirty(bh); int ret; @@ -850,8 +850,6 @@ static int do_journal_get_write_access(handle_t *handle, return ret; } -static int ext4_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); static int ext4_write_begin(struct file *file, struct address_space *mapping, @@ -876,6 +874,17 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { + ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, + flags, pagep); + if (ret < 0) + goto out; + if (ret == 1) { + ret = 0; + goto out; + } + } + retry: handle = ext4_journal_start(inode, needed_blocks); if (IS_ERR(handle)) { @@ -893,6 +902,7 @@ retry: ret = -ENOMEM; goto out; } + *pagep = page; if (ext4_should_dioread_nolock(inode)) @@ -901,8 +911,9 @@ retry: ret = __block_write_begin(page, pos, len, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { - ret = walk_page_buffers(handle, page_buffers(page), - from, to, NULL, do_journal_get_write_access); + ret = ext4_walk_page_buffers(handle, page_buffers(page), + from, to, NULL, + do_journal_get_write_access); } if (ret) { @@ -957,7 +968,12 @@ static int ext4_generic_write_end(struct file *file, struct inode *inode = mapping->host; handle_t *handle = ext4_journal_current_handle(); - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + if (ext4_has_inline_data(inode)) + copied = ext4_write_inline_data_end(inode, pos, len, + copied, page); + else + copied = block_write_end(file, mapping, pos, + len, copied, page, fsdata); /* * No need to use i_size_read() here, the i_size @@ -1114,8 +1130,8 @@ static int ext4_journalled_write_end(struct file *file, page_zero_new_buffers(page, from+copied, to); } - ret = walk_page_buffers(handle, page_buffers(page), from, - to, &partial, write_end_fn); + ret = ext4_walk_page_buffers(handle, page_buffers(page), from, + to, &partial, write_end_fn); if (!partial) SetPageUptodate(page); new_i_size = pos + copied; @@ -1903,7 +1919,7 @@ static int __ext4_journalled_writepage(struct page *page, ClearPageChecked(page); page_bufs = page_buffers(page); BUG_ON(!page_bufs); - walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); + ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); /* As soon as we unlock the page, it can go away, but we have * references to buffers so we are safe */ unlock_page(page); @@ -1916,11 +1932,11 @@ static int __ext4_journalled_writepage(struct page *page, BUG_ON(!ext4_handle_valid(handle)); - ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, - do_journal_get_write_access); + ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, + do_journal_get_write_access); - err = walk_page_buffers(handle, page_bufs, 0, len, NULL, - write_end_fn); + err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, + write_end_fn); if (ret == 0) ret = err; EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; @@ -1928,7 +1944,7 @@ static int __ext4_journalled_writepage(struct page *page, if (!ret) ret = err; - walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); + ext4_walk_page_buffers(handle, 
page_bufs, 0, len, NULL, bput_one); ext4_set_inode_state(inode, EXT4_STATE_JDATA); out: return ret; @@ -2007,8 +2023,8 @@ static int ext4_writepage(struct page *page, commit_write = 1; } page_bufs = page_buffers(page); - if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, - ext4_bh_delay_or_unwritten)) { + if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, + ext4_bh_delay_or_unwritten)) { /* * We don't want to do block allocation, so redirty * the page and return. We may reach here when we do @@ -2831,7 +2847,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait) * We allocate an uinitialized extent if blocks haven't been allocated. * The extent will be converted to initialized after the IO is complete. */ -static int ext4_get_block_write(struct inode *inode, sector_t iblock, +int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", @@ -3738,7 +3754,8 @@ static inline void ext4_iget_extra_inode(struct inode *inode, if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { ext4_set_inode_state(inode, EXT4_STATE_XATTR); ext4_find_inline_data_nolock(inode); - } + } else + EXT4_I(inode)->i_inline_off = 0; } struct inode *ext4_iget(struct super_block *sb, unsigned long ino) @@ -3907,17 +3924,19 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_file_acl); ret = -EIO; goto bad_inode; - } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - (S_ISLNK(inode->i_mode) && - !ext4_inode_is_fast_symlink(inode))) - /* Validate extent which is part of inode */ - ret = ext4_ext_check_inode(inode); - } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - (S_ISLNK(inode->i_mode) && - !ext4_inode_is_fast_symlink(inode))) { - /* Validate block references which are part of inode */ - ret = ext4_ind_check_inode(inode); + } else if (!ext4_has_inline_data(inode)) { + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + (S_ISLNK(inode->i_mode) && + !ext4_inode_is_fast_symlink(inode)))) + /* Validate extent which is part of inode */ + ret = ext4_ext_check_inode(inode); + } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + (S_ISLNK(inode->i_mode) && + !ext4_inode_is_fast_symlink(inode))) { + /* Validate block references which are part of inode */ + ret = ext4_ind_check_inode(inode); + } } if (ret) goto bad_inode; @@ -4104,9 +4123,10 @@ static int ext4_do_update_inode(handle_t *handle, cpu_to_le32(new_encode_dev(inode->i_rdev)); raw_inode->i_block[2] = 0; } - } else + } else if (!ext4_has_inline_data(inode)) { for (block = 0; block < EXT4_N_BLOCKS; block++) raw_inode->i_block[block] = ei->i_data[block]; + } raw_inode->i_disk_version = cpu_to_le32(inode->i_version); if (ei->i_extra_isize) { @@ -4793,8 +4813,9 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) * journal_start/journal_stop which can block and take a long time */ if (page_has_buffers(page)) { - if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, - ext4_bh_unmapped)) { + if (!ext4_walk_page_buffers(NULL, page_buffers(page), + 0, len, NULL, + ext4_bh_unmapped)) { /* Wait so that we don't change page under IO */ wait_on_page_writeback(page); ret = VM_FAULT_LOCKED; @@ -4815,7 +4836,7 @@ retry_alloc: } ret = __block_page_mkwrite(vma, vmf, get_block); if (!ret && ext4_should_journal_data(inode)) { - if (walk_page_buffers(handle, 
page_buffers(page), 0, + if (ext4_walk_page_buffers(handle, page_buffers(page), 0, PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { unlock_page(page); ret = VM_FAULT_SIGBUS; diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 646c9b9be8ed..db5672206238 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -141,6 +141,15 @@ extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); extern int ext4_readpage_inline(struct inode *inode, struct page *page); +extern int ext4_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep); +extern int ext4_write_inline_data_end(struct inode *inode, + loff_t pos, unsigned len, + unsigned copied, + struct page *page); # else /* CONFIG_EXT4_FS_XATTR */ static inline int @@ -262,6 +271,23 @@ static inline int ext4_readpage_inline(struct inode *inode, struct page *page) { return 0; } + +static inline int ext4_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep) +{ + return 0; +} + +static inline int ext4_write_inline_data_end(struct inode *inode, + loff_t pos, unsigned len, + unsigned copied, + struct page *page) +{ + return 0; +} # endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY -- cgit v1.2.1 From 3fdcfb668fd78ec92d9bc2daddf1d41e2a8a30bb Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:05:57 -0500 Subject: ext4: add journalled write support for inline data Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/inline.c | 24 ++++++++++++++++++++ fs/ext4/inode.c | 69 ++++++++++++++++++++++++++++++++++++++++---------------- fs/ext4/xattr.h | 12 ++++++++++ 3 files changed, 85 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 320ff6fe5d8c..01274b1e7d40 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -747,6 +747,30 @@ out: return copied; } +struct buffer_head * +ext4_journalled_write_inline_data(struct inode *inode, + unsigned len, + struct page *page) +{ + int ret; + void *kaddr; + struct ext4_iloc iloc; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) { + ext4_std_error(inode->i_sb, ret); + return NULL; + } + + down_write(&EXT4_I(inode)->xattr_sem); + kaddr = kmap_atomic(page); + ext4_write_inline_data(inode, &iloc, kaddr, 0, len); + kunmap_atomic(kaddr); + up_write(&EXT4_I(inode)->xattr_sem); + + return iloc.bh; +} + int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 70c8d5f323f0..5c91622cfe01 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1124,16 +1124,21 @@ static int ext4_journalled_write_end(struct file *file, BUG_ON(!ext4_handle_valid(handle)); - if (copied < len) { - if (!PageUptodate(page)) - copied = 0; - page_zero_new_buffers(page, from+copied, to); - } + if (ext4_has_inline_data(inode)) + copied = ext4_write_inline_data_end(inode, pos, len, + copied, page); + else { + if (copied < len) { + if (!PageUptodate(page)) + copied = 0; + page_zero_new_buffers(page, from+copied, to); + } - ret = ext4_walk_page_buffers(handle, page_buffers(page), from, - to, &partial, write_end_fn); - if (!partial) - SetPageUptodate(page); + ret = ext4_walk_page_buffers(handle, page_buffers(page), from, + to, &partial, write_end_fn); + if (!partial) + SetPageUptodate(page); + } new_i_size = pos + copied; if 
(new_i_size > inode->i_size) i_size_write(inode, pos+copied); @@ -1911,15 +1916,29 @@ static int __ext4_journalled_writepage(struct page *page, { struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; - struct buffer_head *page_bufs; + struct buffer_head *page_bufs = NULL; handle_t *handle = NULL; - int ret = 0; - int err; + int ret = 0, err = 0; + int inline_data = ext4_has_inline_data(inode); + struct buffer_head *inode_bh = NULL; ClearPageChecked(page); - page_bufs = page_buffers(page); - BUG_ON(!page_bufs); - ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); + + if (inline_data) { + BUG_ON(page->index != 0); + BUG_ON(len > ext4_get_max_inline_size(inode)); + inode_bh = ext4_journalled_write_inline_data(inode, len, page); + if (inode_bh == NULL) + goto out; + } else { + page_bufs = page_buffers(page); + if (!page_bufs) { + BUG(); + goto out; + } + ext4_walk_page_buffers(handle, page_bufs, 0, len, + NULL, bget_one); + } /* As soon as we unlock the page, it can go away, but we have * references to buffers so we are safe */ unlock_page(page); @@ -1932,11 +1951,18 @@ static int __ext4_journalled_writepage(struct page *page, BUG_ON(!ext4_handle_valid(handle)); - ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, - do_journal_get_write_access); + if (inline_data) { + ret = ext4_journal_get_write_access(handle, inode_bh); - err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, - write_end_fn); + err = ext4_handle_dirty_metadata(handle, inode, inode_bh); + + } else { + ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, + do_journal_get_write_access); + + err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, + write_end_fn); + } if (ret == 0) ret = err; EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; @@ -1944,9 +1970,12 @@ static int __ext4_journalled_writepage(struct page *page, if (!ret) ret = err; - ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); + if (!ext4_has_inline_data(inode)) + ext4_walk_page_buffers(handle, page_bufs, 0, len, + NULL, bput_one); ext4_set_inode_state(inode, EXT4_STATE_JDATA); out: + brelse(inode_bh); return ret; } diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index db5672206238..7095ac13fbc2 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -150,6 +150,10 @@ extern int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct page *page); +extern struct buffer_head * +ext4_journalled_write_inline_data(struct inode *inode, + unsigned len, + struct page *page); # else /* CONFIG_EXT4_FS_XATTR */ static inline int @@ -288,6 +292,14 @@ static inline int ext4_write_inline_data_end(struct inode *inode, { return 0; } + +static inline struct buffer_head * +ext4_journalled_write_inline_data(struct inode *inode, + unsigned len, + struct page *page) +{ + return NULL; +} # endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY -- cgit v1.2.1 From 9c3569b50f12e47cc5e907b5e37e4a45c0c10b43 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:05:57 -0500 Subject: ext4: add delalloc support for inline data For delayed allocation mode, we write to inline data if the file is small enough. And in case we write to some offset larger than the inline size, the 1st page is dirtied, so that ext4_da_writepages can handle the conversion. When the 1st page is initialized with blocks, the inline part is removed.
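In outline, ext4_da_write_inline_data_begin() below decides as follows
(journal handling and error paths elided):

	ret = -ENOSPC;
	if (pos + len <= ext4_get_max_inline_size(inode))
		ret = ext4_prepare_inline_data(handle, inode, pos + len);
	if (ret == -ENOSPC)
		/* Doesn't fit in the inode: read the inline bytes into the
		 * 1st page, dirty it, and let ext4_da_writepages allocate
		 * real blocks later. */
		ret = ext4_da_convert_inline_data_to_extent(mapping, inode,
							    flags, fsdata);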
Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 4 ++ fs/ext4/inline.c | 177 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/inode.c | 63 +++++++++++++++++--- fs/ext4/xattr.h | 27 +++++++++ 4 files changed, 262 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9f4efc6c37ba..268636af7f5c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2022,6 +2022,8 @@ int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); +int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create); int ext4_walk_page_buffers(handle_t *handle, struct buffer_head *head, unsigned from, @@ -2031,6 +2033,8 @@ int ext4_walk_page_buffers(handle_t *handle, struct buffer_head *bh)); int do_journal_get_write_access(handle_t *handle, struct buffer_head *bh); +#define FALL_BACK_TO_NONDELALLOC 1 +#define CONVERT_INLINE_DATA 2 extern struct inode *ext4_iget(struct super_block *, unsigned long); extern int ext4_write_inode(struct inode *, struct writeback_control *); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 01274b1e7d40..65f7ffb5437f 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -771,6 +771,183 @@ ext4_journalled_write_inline_data(struct inode *inode, return iloc.bh; } +/* + * Try to make the page cache and handle ready for the inline data case. + * We can call this function in two cases: + * 1. The inode is created and the first write exceeds inline size. We can + * clear the inode state safely. + * 2. The inode has inline data, then we need to read the data, make it + * uptodate and dirty so that ext4_da_writepages can handle it. We don't + * need to start the journal since the file's metadata isn't changed now. + */ +static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, + struct inode *inode, + unsigned flags, + void **fsdata) +{ + int ret = 0, inline_size; + struct page *page; + + page = grab_cache_page_write_begin(mapping, 0, flags); + if (!page) + return -ENOMEM; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + goto out; + } + + inline_size = ext4_get_inline_size(inode); + + if (!PageUptodate(page)) { + ret = ext4_read_inline_page(inode, page); + if (ret < 0) + goto out; + } + + ret = __block_write_begin(page, 0, inline_size, + ext4_da_get_block_prep); + if (ret) { + ext4_truncate_failed_write(inode); + goto out; + } + + SetPageDirty(page); + SetPageUptodate(page); + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + *fsdata = (void *)CONVERT_INLINE_DATA; + +out: + up_read(&EXT4_I(inode)->xattr_sem); + if (page) { + unlock_page(page); + page_cache_release(page); + } + return ret; +} + +/* + * Prepare the write for the inline data. + * If the data can be written into the inode, we just read + * the page and make it uptodate, and start the journal. + * Otherwise read the page and make it dirty so that it can be + * handled in writepages (the i_disksize update is left to the + * normal ext4_da_write_end).
+ */ +int ext4_da_write_inline_data_begin(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep, + void **fsdata) +{ + int ret, inline_size; + handle_t *handle; + struct page *page; + struct ext4_iloc iloc; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + goto out; + } + + inline_size = ext4_get_max_inline_size(inode); + + ret = -ENOSPC; + if (inline_size >= pos + len) { + ret = ext4_prepare_inline_data(handle, inode, pos + len); + if (ret && ret != -ENOSPC) + goto out; + } + + if (ret == -ENOSPC) { + ret = ext4_da_convert_inline_data_to_extent(mapping, + inode, + flags, + fsdata); + goto out; + } + + /* + * We cannot recurse into the filesystem as the transaction + * is already started. + */ + flags |= AOP_FLAG_NOFS; + + page = grab_cache_page_write_begin(mapping, 0, flags); + if (!page) { + ret = -ENOMEM; + goto out; + } + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + ret = 0; + goto out_release_page; + } + + if (!PageUptodate(page)) { + ret = ext4_read_inline_page(inode, page); + if (ret < 0) + goto out_release_page; + } + + up_read(&EXT4_I(inode)->xattr_sem); + *pagep = page; + handle = NULL; + brelse(iloc.bh); + return 1; +out_release_page: + up_read(&EXT4_I(inode)->xattr_sem); + unlock_page(page); + page_cache_release(page); +out: + if (handle) + ext4_journal_stop(handle); + brelse(iloc.bh); + return ret; +} + +int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, + unsigned len, unsigned copied, + struct page *page) +{ + int i_size_changed = 0; + + copied = ext4_write_inline_data_end(inode, pos, len, copied, page); + + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold i_mutex. + * + * But it's important to update i_size while still holding page lock: + * page writeout could otherwise come in and zero beyond i_size. + */ + if (pos+copied > inode->i_size) { + i_size_write(inode, pos+copied); + i_size_changed = 1; + } + unlock_page(page); + page_cache_release(page); + + /* + * Don't mark the inode dirty under page lock. First, it unnecessarily + * makes the holding time of page lock longer. Second, it forces lock + * ordering of page lock and transaction start for journaling + * filesystems. + */ + if (i_size_changed) + mark_inode_dirty(inode); + + return copied; +} int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5c91622cfe01..f16ae02599cd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1790,7 +1790,19 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, * file system block. */ down_read((&EXT4_I(inode)->i_data_sem)); - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + if (ext4_has_inline_data(inode)) { + /* + * We will soon create blocks for this page, and let + * us pretend as if the blocks aren't allocated yet. + * In case of clusters, we have to handle the work + * of mapping from cluster so that the reserved space + * is calculated properly. 
+ */ + if ((EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) && + ext4_find_delalloc_cluster(inode, map->m_lblk)) + map->m_flags |= EXT4_MAP_FROM_CLUSTER; + retval = 0; + } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) retval = ext4_ext_map_blocks(NULL, inode, map, 0); else retval = ext4_ind_map_blocks(NULL, inode, map, 0); @@ -1841,8 +1853,8 @@ out_unlock: * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev * initialized properly. */ -static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) +int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) { struct ext4_map_blocks map; int ret = 0; @@ -2119,7 +2131,8 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) * mpage_da_map_and_submit to map a single contiguous memory region * and then write them. */ -static int write_cache_pages_da(struct address_space *mapping, +static int write_cache_pages_da(handle_t *handle, + struct address_space *mapping, struct writeback_control *wbc, struct mpage_da_data *mpd, pgoff_t *done_index) @@ -2198,6 +2211,17 @@ static int write_cache_pages_da(struct address_space *mapping, wait_on_page_writeback(page); BUG_ON(PageWriteback(page)); + /* + * If we have inline data and arrive here, it means that + * we will soon create the block for the 1st page, so + * we'd better clear the inline data here. + */ + if (ext4_has_inline_data(inode)) { + BUG_ON(ext4_test_inode_state(inode, + EXT4_STATE_MAY_INLINE_DATA)); + ext4_destroy_inline_data(handle, inode); + } + if (mpd->next_page != page->index) mpd->first_page = page->index; mpd->next_page = page->index + 1; @@ -2404,7 +2428,8 @@ retry: * contiguous region of logical blocks that need * blocks to be allocated by ext4 and submit them. */ - ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index); + ret = write_cache_pages_da(handle, mapping, + wbc, &mpd, &done_index); /* * If we have a contiguous extent of pages and we * haven't done the I/O yet, map the blocks and submit @@ -2468,7 +2493,6 @@ out_writepages: return ret; } -#define FALL_BACK_TO_NONDELALLOC 1 static int ext4_nonda_switch(struct super_block *sb) { s64 free_blocks, dirty_blocks; @@ -2525,6 +2549,19 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, } *fsdata = (void *)0; trace_ext4_da_write_begin(inode, pos, len, flags); + + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { + ret = ext4_da_write_inline_data_begin(mapping, inode, + pos, len, flags, + pagep, fsdata); + if (ret < 0) + goto out; + if (ret == 1) { + ret = 0; + goto out; + } + } + retry: /* * With delayed allocation, we don't log the i_disksize update @@ -2626,10 +2663,10 @@ static int ext4_da_write_end(struct file *file, * changes. So let's piggyback the i_disksize mark_inode_dirty * into that. 
*/ - new_i_size = pos + copied; if (copied && new_i_size > EXT4_I(inode)->i_disksize) { - if (ext4_da_should_update_i_disksize(page, end)) { + if (ext4_has_inline_data(inode) || + ext4_da_should_update_i_disksize(page, end)) { down_write(&EXT4_I(inode)->i_data_sem); if (new_i_size > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = new_i_size; @@ -2641,8 +2678,16 @@ static int ext4_da_write_end(struct file *file, ext4_mark_inode_dirty(handle, inode); } } - ret2 = generic_write_end(file, mapping, pos, len, copied, + + if (write_mode != CONVERT_INLINE_DATA && + ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && + ext4_has_inline_data(inode)) + ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied, + page); + else + ret2 = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + copied = ret2; if (ret2 < 0) ret = ret2; diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 7095ac13fbc2..37e66f867645 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -154,6 +154,15 @@ extern struct buffer_head * ext4_journalled_write_inline_data(struct inode *inode, unsigned len, struct page *page); +extern int ext4_da_write_inline_data_begin(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep, + void **fsdata); +extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, + unsigned len, unsigned copied, + struct page *page); # else /* CONFIG_EXT4_FS_XATTR */ static inline int @@ -300,6 +309,24 @@ ext4_journalled_write_inline_data(struct inode *inode, { return NULL; } + +static inline int +ext4_da_write_inline_data_begin(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep, + void **fsdata) +{ + return 0; +} + +static inline int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, + unsigned len, unsigned copied, + struct page *page) +{ + return 0; +} # endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY -- cgit v1.2.1 From a774f9c20e08643fc0e6c48b0419ad7657ed0c04 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:05:57 -0500 Subject: ext4: make ext4_init_dot_dotdot for inline dir usage Currently, the initialization of dot and dotdot is encapsulated in ext4_mkdir and bound to dir_block. So create a new function named ext4_init_new_dir, and move the initialization into ext4_init_dot_dotdot. Now it will be called either in the normal non-inline case (the rec_len of ".." will cover the whole block) or when converting an inline dir to a block (the rec_len of ".." will be the real length). The start of the next entry is also returned for inline dir usage.
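[ Illustration: what the new dotdot_real_len argument changes, assuming a 4096-byte block and csum_size == 0 (EXT4_DIR_REC_LEN() rounds both "." and ".." up to 12 bytes; parent_ino stands for the saved parent inode number):

	/* dotdot_real_len == 0 -- normal mkdir, ".." swallows the block: */
	/*   [ "." rec_len=12 ][ ".." rec_len=4084 ]                      */
	de = ext4_init_dot_dotdot(inode, de, blocksize, csum_size,
				  dir->i_ino, 0);

	/* dotdot_real_len != 0 -- rebuilding a converted inline dir,     */
	/* ".." keeps its real 12 bytes so copied entries can follow:     */
	/*   [ "." rec_len=12 ][ ".." rec_len=12 ][ entries ... ]         */
	de = ext4_init_dot_dotdot(inode, de, blocksize, csum_size,
				  parent_ino, 1);
]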
Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 4 ++ fs/ext4/namei.c | 115 ++++++++++++++++++++++++++++++++++---------------------- 2 files changed, 75 insertions(+), 44 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 268636af7f5c..cf840146ce81 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2415,6 +2415,10 @@ extern void ext4_unwritten_wait(struct inode *inode); extern const struct inode_operations ext4_dir_inode_operations; extern const struct inode_operations ext4_special_inode_operations; extern struct dentry *ext4_get_parent(struct dentry *child); +extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, + struct ext4_dir_entry_2 *de, + int blocksize, int csum_size, + unsigned int parent_ino, int dotdot_real_len); /* symlink.c */ extern const struct inode_operations ext4_symlink_inode_operations; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 88e9a2c7e328..edb9f10c1455 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2230,21 +2230,87 @@ retry: return err; } -static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, + struct ext4_dir_entry_2 *de, + int blocksize, int csum_size, + unsigned int parent_ino, int dotdot_real_len) +{ + de->inode = cpu_to_le32(inode->i_ino); + de->name_len = 1; + de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), + blocksize); + strcpy(de->name, "."); + ext4_set_de_type(inode->i_sb, de, S_IFDIR); + + de = ext4_next_entry(de, blocksize); + de->inode = cpu_to_le32(parent_ino); + de->name_len = 2; + if (!dotdot_real_len) + de->rec_len = ext4_rec_len_to_disk(blocksize - + (csum_size + EXT4_DIR_REC_LEN(1)), + blocksize); + else + de->rec_len = ext4_rec_len_to_disk( + EXT4_DIR_REC_LEN(de->name_len), blocksize); + strcpy(de->name, ".."); + ext4_set_de_type(inode->i_sb, de, S_IFDIR); + + return ext4_next_entry(de, blocksize); +} + +static int ext4_init_new_dir(handle_t *handle, struct inode *dir, + struct inode *inode) { - handle_t *handle; - struct inode *inode; struct buffer_head *dir_block = NULL; struct ext4_dir_entry_2 *de; struct ext4_dir_entry_tail *t; unsigned int blocksize = dir->i_sb->s_blocksize; int csum_size = 0; - int err, retries = 0; + int err; if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) csum_size = sizeof(struct ext4_dir_entry_tail); + inode->i_size = EXT4_I(inode)->i_disksize = blocksize; + dir_block = ext4_bread(handle, inode, 0, 1, &err); + if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) { + if (!err) { + err = -EIO; + ext4_error(inode->i_sb, + "Directory hole detected on inode %lu\n", + inode->i_ino); + } + goto out; + } + BUFFER_TRACE(dir_block, "get_write_access"); + err = ext4_journal_get_write_access(handle, dir_block); + if (err) + goto out; + de = (struct ext4_dir_entry_2 *)dir_block->b_data; + ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0); + set_nlink(inode, 2); + if (csum_size) { + t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize); + initialize_dirent_tail(t, blocksize); + } + + BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_dirent_node(handle, inode, dir_block); + if (err) + goto out; + set_buffer_verified(dir_block); +out: + brelse(dir_block); + return err; +} + +static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + handle_t *handle; + struct inode *inode; + int err, retries = 0; + if 
(EXT4_DIR_LINK_MAX(dir)) return -EMLINK; @@ -2268,47 +2334,9 @@ retry: inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; - inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; - if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) { - if (!err) { - err = -EIO; - ext4_error(inode->i_sb, - "Directory hole detected on inode %lu\n", - inode->i_ino); - } - goto out_clear_inode; - } - BUFFER_TRACE(dir_block, "get_write_access"); - err = ext4_journal_get_write_access(handle, dir_block); + err = ext4_init_new_dir(handle, dir, inode); if (err) goto out_clear_inode; - de = (struct ext4_dir_entry_2 *) dir_block->b_data; - de->inode = cpu_to_le32(inode->i_ino); - de->name_len = 1; - de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), - blocksize); - strcpy(de->name, "."); - ext4_set_de_type(dir->i_sb, de, S_IFDIR); - de = ext4_next_entry(de, blocksize); - de->inode = cpu_to_le32(dir->i_ino); - de->rec_len = ext4_rec_len_to_disk(blocksize - - (csum_size + EXT4_DIR_REC_LEN(1)), - blocksize); - de->name_len = 2; - strcpy(de->name, ".."); - ext4_set_de_type(dir->i_sb, de, S_IFDIR); - set_nlink(inode, 2); - - if (csum_size) { - t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize); - initialize_dirent_tail(t, blocksize); - } - - BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_dirent_node(handle, inode, dir_block); - if (err) - goto out_clear_inode; - set_buffer_verified(dir_block); err = ext4_mark_inode_dirty(handle, inode); if (!err) err = ext4_add_entry(handle, dentry, inode); @@ -2328,7 +2356,6 @@ out_clear_inode: unlock_new_inode(inode); d_instantiate(dentry, inode); out_stop: - brelse(dir_block); ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; -- cgit v1.2.1 From 226ba972b0863783ad377f741f6ff0538f31ab00 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:05:58 -0500 Subject: ext4: refactor __ext4_check_dir_entry() to accept start and size The __ext4_check_dir_entry() function is used to check whether the de is over the block boundary. Now with inline data, it could be within the block boundary while still exceeding the inode size. So change this function to check the overflow more precisely.
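[ Illustration: the effect is easiest to see at the call sites. The macro now bounds-checks against whatever buffer the caller hands it, so both of the following forms work (the inline form is how later patches in this series use it):

	/* an ordinary dir block: bounds come from the bh */
	if (ext4_check_dir_entry(dir, NULL, de, bh,
				 bh->b_data, bh->b_size, offset))
		return -EIO;

	/* an inline area inside the inode: same check, smaller bounds */
	if (ext4_check_dir_entry(dir, NULL, de, iloc.bh,
				 inline_start, inline_size, offset))
		return -EIO;
]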
Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/dir.c | 16 ++++++++-------- fs/ext4/ext4.h | 7 ++++--- fs/ext4/namei.c | 13 +++++++++---- 3 files changed, 21 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 8e07d2a5a139..7c9d08b0f2fe 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -72,7 +72,7 @@ static int is_dx_dir(struct inode *inode) int __ext4_check_dir_entry(const char *function, unsigned int line, struct inode *dir, struct file *filp, struct ext4_dir_entry_2 *de, - struct buffer_head *bh, + struct buffer_head *bh, char *buf, int size, unsigned int offset) { const char *error_msg = NULL; @@ -85,9 +85,8 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, error_msg = "rec_len % 4 != 0"; else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) error_msg = "rec_len is too small for name_len"; - else if (unlikely(((char *) de - bh->b_data) + rlen > - dir->i_sb->s_blocksize)) - error_msg = "directory entry across blocks"; + else if (unlikely(((char *) de - buf) + rlen > size)) + error_msg = "directory entry across range"; else if (unlikely(le32_to_cpu(de->inode) > le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; @@ -98,14 +97,14 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, ext4_error_file(filp, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u(%u), " "inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned) (offset % bh->b_size), + error_msg, (unsigned) (offset % size), offset, le32_to_cpu(de->inode), rlen, de->name_len); else ext4_error_inode(dir, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u(%u), " "inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned) (offset % bh->b_size), + error_msg, (unsigned) (offset % size), offset, le32_to_cpu(de->inode), rlen, de->name_len); @@ -221,8 +220,9 @@ revalidate: while (!error && filp->f_pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); - if (ext4_check_dir_entry(inode, filp, de, - bh, offset)) { + if (ext4_check_dir_entry(inode, filp, de, bh, + bh->b_data, bh->b_size, + offset)) { /* * On error, skip the f_pos to the next block */ diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index cf840146ce81..59cbf498fd5f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1960,10 +1960,11 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, struct file *, struct ext4_dir_entry_2 *, - struct buffer_head *, unsigned int); -#define ext4_check_dir_entry(dir, filp, de, bh, offset) \ + struct buffer_head *, char *, int, + unsigned int); +#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ - (de), (bh), (offset))) + (de), (bh), (buf), (size), (offset))) extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index edb9f10c1455..10da2d50a5d8 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -892,6 +892,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, EXT4_DIR_REC_LEN(0)); for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { if (ext4_check_dir_entry(dir, NULL, de, bh, + bh->b_data, bh->b_size, (block<i_sb)) + ((char *)de - bh->b_data))) { /* On error, skip the f_pos to the next block. 
*/ @@ -1130,7 +1131,8 @@ static inline int search_dirblock(struct buffer_head *bh, if ((char *) de + namelen <= dlimit && ext4_match (namelen, name, de)) { /* found a match - just to be sure, do a full check */ - if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) + if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, + bh->b_size, offset)) return -1; *res_dir = de; return 1; @@ -1643,7 +1645,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, de = (struct ext4_dir_entry_2 *)bh->b_data; top = bh->b_data + (blocksize - csum_size) - reclen; while ((char *) de <= top) { - if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) + if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, + bh->b_size, offset)) return -EIO; if (ext4_match(namelen, name, de)) return -EEXIST; @@ -2076,7 +2079,8 @@ static int ext4_delete_entry(handle_t *handle, pde = NULL; de = (struct ext4_dir_entry_2 *) bh->b_data; while (i < bh->b_size - csum_size) { - if (ext4_check_dir_entry(dir, NULL, de, bh, i)) + if (ext4_check_dir_entry(dir, NULL, de, bh, + bh->b_data, bh->b_size, i)) return -EIO; if (de == de_del) { BUFFER_TRACE(bh, "get_write_access"); @@ -2439,7 +2443,8 @@ static int empty_dir(struct inode *inode) set_buffer_verified(bh); de = (struct ext4_dir_entry_2 *) bh->b_data; } - if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) { + if (ext4_check_dir_entry(inode, NULL, de, bh, + bh->b_data, bh->b_size, offset)) { de = (struct ext4_dir_entry_2 *)(bh->b_data + sb->s_blocksize); offset = (offset | (sb->s_blocksize - 1)) + 1; -- cgit v1.2.1 From 978fef914a2e6b8ad5672d0a39f9201b7aa7c396 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:05:58 -0500 Subject: ext4: create __ext4_insert_dentry for dir entry insertion The old add_dirent_to_buf handles all the work related to adding a dir entry to a dir block. Now we have inline data, so create two new functions, __ext4_find_dest_de and __ext4_insert_dentry, that do the real work, and let add_dirent_to_buf call them.
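[ Illustration: a caller that used to open-code the scan now does roughly the following against any buffer of dirents -- a dir block here, an inline area in the follow-on patches (error handling trimmed):

	struct ext4_dir_entry_2 *de;
	int err;

	/* find a free slot, or fail with -EEXIST/-ENOSPC/-EIO */
	err = ext4_find_dest_de(dir, inode, bh, buf, buf_size,
				name, namelen, &de);
	if (err)
		return err;

	/* get journal access, then splice the new dentry in,
	 * splitting 'de' if it is already in use */
	err = ext4_journal_get_write_access(handle, bh);
	if (err)
		return err;
	ext4_insert_dentry(inode, de, buf_size, name, namelen);
]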
Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 15 ++++++++ fs/ext4/namei.c | 105 +++++++++++++++++++++++++++++++++++--------------------- 2 files changed, 80 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 59cbf498fd5f..8e9e94cf1bca 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1969,6 +1969,21 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent); extern void ext4_htree_free_dir_info(struct dir_private_info *p); +extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, + struct buffer_head *bh, + void *buf, int buf_size, + const char *name, int namelen, + struct ext4_dir_entry_2 **dest_de); +void ext4_insert_dentry(struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + const char *name, int namelen); +static inline void ext4_update_dx_flag(struct inode *inode) +{ + if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_COMPAT_DIR_INDEX)) + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); +} /* fsync.c */ extern int ext4_sync_file(struct file *, loff_t, loff_t, int); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 10da2d50a5d8..bb9259d20b55 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1084,13 +1084,6 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) dx_set_count(entries, count + 1); } -static void ext4_update_dx_flag(struct inode *inode) -{ - if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_COMPAT_DIR_INDEX)) - ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); -} - /* * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure. * @@ -1614,6 +1607,63 @@ errout: return NULL; } +int ext4_find_dest_de(struct inode *dir, struct inode *inode, + struct buffer_head *bh, + void *buf, int buf_size, + const char *name, int namelen, + struct ext4_dir_entry_2 **dest_de) +{ + struct ext4_dir_entry_2 *de; + unsigned short reclen = EXT4_DIR_REC_LEN(namelen); + int nlen, rlen; + unsigned int offset = 0; + char *top; + + de = (struct ext4_dir_entry_2 *)buf; + top = buf + buf_size - reclen; + while ((char *) de <= top) { + if (ext4_check_dir_entry(dir, NULL, de, bh, + buf, buf_size, offset)) + return -EIO; + if (ext4_match(namelen, name, de)) + return -EEXIST; + nlen = EXT4_DIR_REC_LEN(de->name_len); + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); + if ((de->inode ? rlen - nlen : rlen) >= reclen) + break; + de = (struct ext4_dir_entry_2 *)((char *)de + rlen); + offset += rlen; + } + if ((char *) de > top) + return -ENOSPC; + + *dest_de = de; + return 0; +} + +void ext4_insert_dentry(struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + const char *name, int namelen) +{ + + int nlen, rlen; + + nlen = EXT4_DIR_REC_LEN(de->name_len); + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); + if (de->inode) { + struct ext4_dir_entry_2 *de1 = + (struct ext4_dir_entry_2 *)((char *)de + nlen); + de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size); + de->rec_len = ext4_rec_len_to_disk(nlen, buf_size); + de = de1; + } + de->file_type = EXT4_FT_UNKNOWN; + de->inode = cpu_to_le32(inode->i_ino); + ext4_set_de_type(inode->i_sb, de, inode->i_mode); + de->name_len = namelen; + memcpy(de->name, name, namelen); +} /* * Add a new entry into a directory (leaf) block. 
If de is non-NULL, * it points to a directory entry which is guaranteed to be large @@ -1629,12 +1679,10 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, struct inode *dir = dentry->d_parent->d_inode; const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; - unsigned int offset = 0; unsigned int blocksize = dir->i_sb->s_blocksize; unsigned short reclen; - int nlen, rlen, err; - char *top; int csum_size = 0; + int err; if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) @@ -1642,23 +1690,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, reclen = EXT4_DIR_REC_LEN(namelen); if (!de) { - de = (struct ext4_dir_entry_2 *)bh->b_data; - top = bh->b_data + (blocksize - csum_size) - reclen; - while ((char *) de <= top) { - if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, - bh->b_size, offset)) - return -EIO; - if (ext4_match(namelen, name, de)) - return -EEXIST; - nlen = EXT4_DIR_REC_LEN(de->name_len); - rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); - if ((de->inode? rlen - nlen: rlen) >= reclen) - break; - de = (struct ext4_dir_entry_2 *)((char *)de + rlen); - offset += rlen; - } - if ((char *) de > top) - return -ENOSPC; + err = ext4_find_dest_de(dir, inode, + bh, bh->b_data, blocksize - csum_size, + name, namelen, &de); + if (err) + return err; } BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, bh); @@ -1668,19 +1704,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, } /* By now the buffer is marked for journaling */ - nlen = EXT4_DIR_REC_LEN(de->name_len); - rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); - if (de->inode) { - struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); - de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize); - de->rec_len = ext4_rec_len_to_disk(nlen, blocksize); - de = de1; - } - de->file_type = EXT4_FT_UNKNOWN; - de->inode = cpu_to_le32(inode->i_ino); - ext4_set_de_type(dir->i_sb, de, inode->i_mode); - de->name_len = namelen; - memcpy(de->name, name, namelen); + ext4_insert_dentry(inode, de, blocksize, name, namelen); + /* * XXX shouldn't update any times until successful * completion of syscall, but too many callers depend -- cgit v1.2.1 From 3c47d54170b6a678875566b1b8d6dcf57904e49b Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:05:59 -0500 Subject: ext4: let add_dir_entry handle inline data properly This patch lets add_dir_entry handle the inline data case. The dir is initialized as an inline dir first, and then we can try to add some files to it; when the inline space can't hold all the entries, a dir block will be created and the dir entries will be moved to it. Also, for an inline dir, "." and ".." are removed and we only use 4 bytes to store the parent inode number. These two entries will be added back when we convert an inline dir to a block-based one. [ Folded in patch from Dan Carpenter to remove an unused variable.
] Signed-off-by: Tao Ma Signed-off-by: Dan Carpenter Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 10 ++ fs/ext4/inline.c | 377 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/namei.c | 34 +++-- fs/ext4/xattr.h | 19 +++ 4 files changed, 430 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8e9e94cf1bca..689ce1d696b8 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1616,6 +1616,11 @@ struct ext4_dir_entry_tail { __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */ }; +#define EXT4_DIRENT_TAIL(block, blocksize) \ + ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ + ((blocksize) - \ + sizeof(struct ext4_dir_entry_tail)))) + /* * Ext4 directory file types. Only the low 3 bits are used. The * other bits are reserved for now. @@ -2435,6 +2440,11 @@ extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, struct ext4_dir_entry_2 *de, int blocksize, int csum_size, unsigned int parent_ino, int dotdot_real_len); +extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t, + unsigned int blocksize); +extern int ext4_handle_dirty_dirent_node(handle_t *handle, + struct inode *inode, + struct buffer_head *bh); /* symlink.c */ extern const struct inode_operations ext4_symlink_inode_operations; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 65f7ffb5437f..bf7322818738 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -18,6 +18,7 @@ #define EXT4_XATTR_SYSTEM_DATA "data" #define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) +#define EXT4_INLINE_DOTDOT_SIZE 4 int ext4_get_inline_size(struct inode *inode) { @@ -949,6 +950,382 @@ int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, return copied; } +#ifdef INLINE_DIR_DEBUG +void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, + void *inline_start, int inline_size) +{ + int offset; + unsigned short de_len; + struct ext4_dir_entry_2 *de = inline_start; + void *dlimit = inline_start + inline_size; + + trace_printk("inode %lu\n", dir->i_ino); + offset = 0; + while ((void *)de < dlimit) { + de_len = ext4_rec_len_from_disk(de->rec_len, inline_size); + trace_printk("de: off %u rlen %u name %*.s nlen %u ino %u\n", + offset, de_len, de->name_len, de->name, + de->name_len, le32_to_cpu(de->inode)); + if (ext4_check_dir_entry(dir, NULL, de, bh, + inline_start, inline_size, offset)) + BUG(); + + offset += de_len; + de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); + } +} +#else +#define ext4_show_inline_dir(dir, bh, inline_start, inline_size) +#endif + +/* + * Add a new entry into a inline dir. + * It will return -ENOSPC if no space is available, and -EIO + * and -EEXIST if directory entry already exists. 
+ */ +static int ext4_add_dirent_to_inline(handle_t *handle, + struct dentry *dentry, + struct inode *inode, + struct ext4_iloc *iloc, + void *inline_start, int inline_size) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned short reclen; + int err; + struct ext4_dir_entry_2 *de; + + reclen = EXT4_DIR_REC_LEN(namelen); + err = ext4_find_dest_de(dir, inode, iloc->bh, + inline_start, inline_size, + name, namelen, &de); + if (err) + return err; + + err = ext4_journal_get_write_access(handle, iloc->bh); + if (err) + return err; + ext4_insert_dentry(inode, de, inline_size, name, namelen); + + ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); + + /* + * XXX shouldn't update any times until successful + * completion of syscall, but too many callers depend + * on this. + * + * XXX similarly, too many callers depend on + * ext4_new_inode() setting the times, but error + * recovery deletes the inode, so the worst that can + * happen is that the times are slightly out of date + * and/or different from the directory change time. + */ + dir->i_mtime = dir->i_ctime = ext4_current_time(dir); + ext4_update_dx_flag(dir); + dir->i_version++; + ext4_mark_inode_dirty(handle, dir); + return 1; +} + +static void *ext4_get_inline_xattr_pos(struct inode *inode, + struct ext4_iloc *iloc) +{ + struct ext4_xattr_entry *entry; + struct ext4_xattr_ibody_header *header; + + BUG_ON(!EXT4_I(inode)->i_inline_off); + + header = IHDR(inode, ext4_raw_inode(iloc)); + entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) + + EXT4_I(inode)->i_inline_off); + + return (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs); +} + +/* Set the final de to cover the whole block. */ +static void ext4_update_final_de(void *de_buf, int old_size, int new_size) +{ + struct ext4_dir_entry_2 *de, *prev_de; + void *limit; + int de_len; + + de = (struct ext4_dir_entry_2 *)de_buf; + if (old_size) { + limit = de_buf + old_size; + do { + prev_de = de; + de_len = ext4_rec_len_from_disk(de->rec_len, old_size); + de_buf += de_len; + de = (struct ext4_dir_entry_2 *)de_buf; + } while (de_buf < limit); + + prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size - + old_size, new_size); + } else { + /* this is just created, so create an empty entry. 
*/ + de->inode = 0; + de->rec_len = ext4_rec_len_to_disk(new_size, new_size); + } +} + +static int ext4_update_inline_dir(handle_t *handle, struct inode *dir, + struct ext4_iloc *iloc) +{ + int ret; + int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; + int new_size = get_max_inline_xattr_value_size(dir, iloc); + + if (new_size - old_size <= EXT4_DIR_REC_LEN(1)) + return -ENOSPC; + + ret = ext4_update_inline_data(handle, dir, + new_size + EXT4_MIN_INLINE_DATA_SIZE); + if (ret) + return ret; + + ext4_update_final_de(ext4_get_inline_xattr_pos(dir, iloc), old_size, + EXT4_I(dir)->i_inline_size - + EXT4_MIN_INLINE_DATA_SIZE); + dir->i_size = EXT4_I(dir)->i_disksize = EXT4_I(dir)->i_inline_size; + return 0; +} + +static void ext4_restore_inline_data(handle_t *handle, struct inode *inode, + struct ext4_iloc *iloc, + void *buf, int inline_size) +{ + ext4_create_inline_data(handle, inode, inline_size); + ext4_write_inline_data(inode, iloc, buf, 0, inline_size); + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); +} + +static int ext4_finish_convert_inline_dir(handle_t *handle, + struct inode *inode, + struct buffer_head *dir_block, + void *buf, + int inline_size) +{ + int err, csum_size = 0, header_size = 0; + struct ext4_dir_entry_2 *de; + struct ext4_dir_entry_tail *t; + void *target = dir_block->b_data; + + /* + * First create "." and ".." and then copy the dir information + * back to the block. + */ + de = (struct ext4_dir_entry_2 *)target; + de = ext4_init_dot_dotdot(inode, de, + inode->i_sb->s_blocksize, csum_size, + le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1); + header_size = (void *)de - target; + + memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE, + inline_size - EXT4_INLINE_DOTDOT_SIZE); + + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); + + inode->i_size = inode->i_sb->s_blocksize; + i_size_write(inode, inode->i_sb->s_blocksize); + EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; + ext4_update_final_de(dir_block->b_data, + inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size, + inode->i_sb->s_blocksize - csum_size); + + if (csum_size) { + t = EXT4_DIRENT_TAIL(dir_block->b_data, + inode->i_sb->s_blocksize); + initialize_dirent_tail(t, inode->i_sb->s_blocksize); + } + set_buffer_uptodate(dir_block); + err = ext4_handle_dirty_dirent_node(handle, inode, dir_block); + if (err) + goto out; + set_buffer_verified(dir_block); +out: + return err; +} + +static int ext4_convert_inline_data_nolock(handle_t *handle, + struct inode *inode, + struct ext4_iloc *iloc) +{ + int error; + void *buf = NULL; + struct buffer_head *data_bh = NULL; + struct ext4_map_blocks map; + int inline_size; + + inline_size = ext4_get_inline_size(inode); + buf = kmalloc(inline_size, GFP_NOFS); + if (!buf) { + error = -ENOMEM; + goto out; + } + + error = ext4_read_inline_data(inode, buf, inline_size, iloc); + if (error < 0) + goto out; + + error = ext4_destroy_inline_data_nolock(handle, inode); + if (error) + goto out; + + map.m_lblk = 0; + map.m_len = 1; + map.m_flags = 0; + error = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE); + if (error < 0) + goto out_restore; + if (!(map.m_flags & EXT4_MAP_MAPPED)) { + error = -EIO; + goto out_restore; + } + + data_bh = sb_getblk(inode->i_sb, map.m_pblk); + if (!data_bh) { + error = -EIO; + goto out_restore; + } + + lock_buffer(data_bh); + error = ext4_journal_get_create_access(handle, data_bh); + if (error) { + unlock_buffer(data_bh); + 
error = -EIO; + goto out_restore; + } + memset(data_bh->b_data, 0, inode->i_sb->s_blocksize); + + if (!S_ISDIR(inode->i_mode)) { + memcpy(data_bh->b_data, buf, inline_size); + set_buffer_uptodate(data_bh); + error = ext4_handle_dirty_metadata(handle, + inode, data_bh); + } else { + error = ext4_finish_convert_inline_dir(handle, inode, data_bh, + buf, inline_size); + } + + unlock_buffer(data_bh); +out_restore: + if (error) + ext4_restore_inline_data(handle, inode, iloc, buf, inline_size); + +out: + brelse(data_bh); + kfree(buf); + return error; +} + +/* + * Try to add the new entry to the inline data. + * If succeeds, return 0. If not, extended the inline dir and copied data to + * the new created block. + */ +int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) +{ + int ret, inline_size; + void *inline_start; + struct ext4_iloc iloc; + struct inode *dir = dentry->d_parent->d_inode; + + ret = ext4_get_inode_loc(dir, &iloc); + if (ret) + return ret; + + down_write(&EXT4_I(dir)->xattr_sem); + if (!ext4_has_inline_data(dir)) + goto out; + + inline_start = (void *)ext4_raw_inode(&iloc)->i_block + + EXT4_INLINE_DOTDOT_SIZE; + inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; + + ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc, + inline_start, inline_size); + if (ret != -ENOSPC) + goto out; + + /* check whether it can be inserted to inline xattr space. */ + inline_size = EXT4_I(dir)->i_inline_size - + EXT4_MIN_INLINE_DATA_SIZE; + if (!inline_size) { + /* Try to use the xattr space.*/ + ret = ext4_update_inline_dir(handle, dir, &iloc); + if (ret && ret != -ENOSPC) + goto out; + + inline_size = EXT4_I(dir)->i_inline_size - + EXT4_MIN_INLINE_DATA_SIZE; + } + + if (inline_size) { + inline_start = ext4_get_inline_xattr_pos(dir, &iloc); + + ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc, + inline_start, inline_size); + + if (ret != -ENOSPC) + goto out; + } + + /* + * The inline space is filled up, so create a new block for it. + * As the extent tree will be created, we have to save the inline + * dir first. + */ + ret = ext4_convert_inline_data_nolock(handle, dir, &iloc); + +out: + ext4_mark_inode_dirty(handle, dir); + up_write(&EXT4_I(dir)->xattr_sem); + brelse(iloc.bh); + return ret; +} + +/* + * Try to create the inline data for the new dir. + * If it succeeds, return 0, otherwise return the error. + * In case of ENOSPC, the caller should create the normal disk layout dir. + */ +int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, + struct inode *inode) +{ + int ret, inline_size = EXT4_MIN_INLINE_DATA_SIZE; + struct ext4_iloc iloc; + struct ext4_dir_entry_2 *de; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + + ret = ext4_prepare_inline_data(handle, inode, inline_size); + if (ret) + goto out; + + /* + * For inline dir, we only save the inode information for the ".." + * and create a fake dentry to cover the left space. 
+ */ + de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; + de->inode = cpu_to_le32(parent->i_ino); + de = (struct ext4_dir_entry_2 *)((void *)de + EXT4_INLINE_DOTDOT_SIZE); + de->inode = 0; + de->rec_len = ext4_rec_len_to_disk( + inline_size - EXT4_INLINE_DOTDOT_SIZE, + inline_size); + set_nlink(inode, 2); + inode->i_size = EXT4_I(inode)->i_disksize = inline_size; +out: + brelse(iloc.bh); + return ret; +} + int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) { int ret; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index bb9259d20b55..3cde36bd8020 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -202,13 +202,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode); /* checksumming functions */ -#define EXT4_DIRENT_TAIL(block, blocksize) \ - ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ - ((blocksize) - \ - sizeof(struct ext4_dir_entry_tail)))) - -static void initialize_dirent_tail(struct ext4_dir_entry_tail *t, - unsigned int blocksize) +void initialize_dirent_tail(struct ext4_dir_entry_tail *t, + unsigned int blocksize) { memset(t, 0, sizeof(struct ext4_dir_entry_tail)); t->det_rec_len = ext4_rec_len_to_disk( @@ -307,9 +302,9 @@ static void ext4_dirent_csum_set(struct inode *inode, (void *)t - (void *)dirent); } -static inline int ext4_handle_dirty_dirent_node(handle_t *handle, - struct inode *inode, - struct buffer_head *bh) +int ext4_handle_dirty_dirent_node(handle_t *handle, + struct inode *inode, + struct buffer_head *bh) { ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data); return ext4_handle_dirty_metadata(handle, inode, bh); @@ -1878,6 +1873,17 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, blocksize = sb->s_blocksize; if (!dentry->d_name.len) return -EINVAL; + + if (ext4_has_inline_data(dir)) { + retval = ext4_try_add_inline_entry(handle, dentry, inode); + if (retval < 0) + return retval; + if (retval == 1) { + retval = 0; + return retval; + } + } + if (is_dx(dir)) { retval = ext4_dx_add_entry(handle, dentry, inode); if (!retval || (retval != ERR_BAD_DX_DIR)) @@ -2301,6 +2307,14 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) csum_size = sizeof(struct ext4_dir_entry_tail); + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { + err = ext4_try_create_inline_dir(handle, dir, inode); + if (err < 0 && err != -ENOSPC) + goto out; + if (!err) + goto out; + } + inode->i_size = EXT4_I(inode)->i_disksize = blocksize; dir_block = ext4_bread(handle, inode, 0, 1, &err); if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) { diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 37e66f867645..397ef4bbaf1e 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -163,6 +163,11 @@ extern int ext4_da_write_inline_data_begin(struct address_space *mapping, extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct page *page); +extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); +extern int ext4_try_create_inline_dir(handle_t *handle, + struct inode *parent, + struct inode *inode); # else /* CONFIG_EXT4_FS_XATTR */ static inline int @@ -327,6 +332,20 @@ static inline int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, { return 0; } + +static inline int ext4_try_add_inline_entry(handle_t *handle, + struct dentry *dentry, + struct inode *inode) +{ + return 0; +} + +static inline int 
ext4_try_create_inline_dir(handle_t *handle, + struct inode *parent, + struct inode *inode) +{ + return 0; +} # endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY -- cgit v1.2.1 From 65d165d9366dbf783d0102177006d47c8859ba31 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:05:59 -0500 Subject: ext4: let ext4_readdir handle inline data For "." and "..", we just call filldir by ourselves instead of iterating the real dir entry. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/dir.c | 25 +++++----- fs/ext4/ext4.h | 12 +++++ fs/ext4/inline.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/xattr.h | 9 ++++ 4 files changed, 169 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 7c9d08b0f2fe..b8d877f6c1fa 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -27,23 +27,11 @@ #include #include #include "ext4.h" - -static unsigned char ext4_filetype_table[] = { - DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK -}; +#include "xattr.h" static int ext4_dx_readdir(struct file *filp, void *dirent, filldir_t filldir); -static unsigned char get_dtype(struct super_block *sb, int filetype) -{ - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || - (filetype >= EXT4_FT_MAX)) - return DT_UNKNOWN; - - return (ext4_filetype_table[filetype]); -} - /** * Check if the given dir-inode refers to an htree-indexed directory * (or a directory which chould potentially get coverted to use htree @@ -68,6 +56,9 @@ static int is_dx_dir(struct inode *inode) * Return 0 if the directory entry is OK, and 1 if there is a problem * * Note: this is the opposite of what ext2 and ext3 historically returned... + * + * bh passed here can be an inode block or a dir data block, depending + * on the inode inline data flag. 
*/ int __ext4_check_dir_entry(const char *function, unsigned int line, struct inode *dir, struct file *filp, @@ -124,6 +115,14 @@ static int ext4_readdir(struct file *filp, int ret = 0; int dir_has_error = 0; + if (ext4_has_inline_data(inode)) { + int has_inline_data = 1; + ret = ext4_read_inline_dir(filp, dirent, filldir, + &has_inline_data); + if (has_inline_data) + return ret; + } + if (is_dx_dir(inode)) { err = ext4_dx_readdir(filp, dirent, filldir); if (err != ERR_BAD_DX_DIR) { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 689ce1d696b8..e3a74658c63c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1989,6 +1989,18 @@ static inline void ext4_update_dx_flag(struct inode *inode) EXT4_FEATURE_COMPAT_DIR_INDEX)) ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); } +static unsigned char ext4_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static inline unsigned char get_dtype(struct super_block *sb, int filetype) +{ + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || + (filetype >= EXT4_FT_MAX)) + return DT_UNKNOWN; + + return ext4_filetype_table[filetype]; +} /* fsync.c */ extern int ext4_sync_file(struct file *, loff_t, loff_t, int); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index bf7322818738..471504133c76 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1288,6 +1288,142 @@ out: return ret; } +int ext4_read_inline_dir(struct file *filp, + void *dirent, filldir_t filldir, + int *has_inline_data) +{ + int error = 0; + unsigned int offset, parent_ino; + int i, stored; + struct ext4_dir_entry_2 *de; + struct super_block *sb; + struct inode *inode = filp->f_path.dentry->d_inode; + int ret, inline_size = 0; + struct ext4_iloc iloc; + void *dir_buf = NULL; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + up_read(&EXT4_I(inode)->xattr_sem); + *has_inline_data = 0; + goto out; + } + + inline_size = ext4_get_inline_size(inode); + dir_buf = kmalloc(inline_size, GFP_NOFS); + if (!dir_buf) { + ret = -ENOMEM; + up_read(&EXT4_I(inode)->xattr_sem); + goto out; + } + + ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); + up_read(&EXT4_I(inode)->xattr_sem); + if (ret < 0) + goto out; + + sb = inode->i_sb; + stored = 0; + parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); + + while (!error && !stored && filp->f_pos < inode->i_size) { +revalidate: + /* + * If the version has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the inline + * dir to make sure. + */ + if (filp->f_version != inode->i_version) { + for (i = 0; + i < inode->i_size && i < offset;) { + if (!i) { + /* skip "." and ".." if needed. */ + i += EXT4_INLINE_DOTDOT_SIZE; + continue; + } + de = (struct ext4_dir_entry_2 *) + (dir_buf + i); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. A + * failure will be detected in the + * dirent test below. 
*/ + if (ext4_rec_len_from_disk(de->rec_len, + inline_size) < EXT4_DIR_REC_LEN(1)) + break; + i += ext4_rec_len_from_disk(de->rec_len, + inline_size); + } + offset = i; + filp->f_pos = offset; + filp->f_version = inode->i_version; + } + + while (!error && filp->f_pos < inode->i_size) { + if (filp->f_pos == 0) { + error = filldir(dirent, ".", 1, 0, inode->i_ino, + DT_DIR); + if (error) + break; + stored++; + + error = filldir(dirent, "..", 2, 0, parent_ino, + DT_DIR); + if (error) + break; + stored++; + + filp->f_pos = offset = EXT4_INLINE_DOTDOT_SIZE; + continue; + } + + de = (struct ext4_dir_entry_2 *)(dir_buf + offset); + if (ext4_check_dir_entry(inode, filp, de, + iloc.bh, dir_buf, + inline_size, offset)) { + ret = stored; + goto out; + } + offset += ext4_rec_len_from_disk(de->rec_len, + inline_size); + if (le32_to_cpu(de->inode)) { + /* We might block in the next section + * if the data destination is + * currently swapped out. So, use a + * version stamp to detect whether or + * not the directory has been modified + * during the copy operation. + */ + u64 version = filp->f_version; + + error = filldir(dirent, de->name, + de->name_len, + filp->f_pos, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type)); + if (error) + break; + if (version != filp->f_version) + goto revalidate; + stored++; + } + filp->f_pos += ext4_rec_len_from_disk(de->rec_len, + inline_size); + } + offset = 0; + } +out: + kfree(dir_buf); + brelse(iloc.bh); + return ret; +} + /* * Try to create the inline data for the new dir. * If it succeeds, return 0, otherwise return the error. diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 397ef4bbaf1e..539e6a08c95f 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -168,6 +168,9 @@ extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, extern int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, struct inode *inode); +extern int ext4_read_inline_dir(struct file *filp, + void *dirent, filldir_t filldir, + int *has_inline_data); # else /* CONFIG_EXT4_FS_XATTR */ static inline int @@ -346,6 +349,12 @@ static inline int ext4_try_create_inline_dir(handle_t *handle, { return 0; } +static inline int ext4_read_inline_dir(struct file *filp, + void *dirent, filldir_t filldir, + int *has_inline_data) +{ + return 0; +} # endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY -- cgit v1.2.1 From 7335cd3b41b1e704608ca46159641ca9cb598121 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:05:59 -0500 Subject: ext4: create a new function search_dir search_dirblock is used to search a dir block, but the code is almost the same for searching an inline dir. So create a new function search_dir and let search_dirblock call it.
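[ Illustration: search_dirblock keeps its old behaviour as a thin wrapper, while search_dir can be pointed at any byte range that holds dirents; the second form below is how the next patch uses it for an inline dir:

	/* a dir block */
	ret = search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir,
			 d_name, offset, res_dir);

	/* an inline area */
	ret = search_dir(iloc.bh, inline_start, inline_size, dir,
			 d_name, 0, res_dir);
]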
Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 7 +++++++ fs/ext4/namei.c | 26 +++++++++++++++++++------- 2 files changed, 26 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e3a74658c63c..a971b65bf5ca 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2122,6 +2122,13 @@ extern int ext4_orphan_add(handle_t *, struct inode *); extern int ext4_orphan_del(handle_t *, struct inode *); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, __u32 *next_hash); +extern int search_dir(struct buffer_head *bh, + char *search_buf, + int buf_size, + struct inode *dir, + const struct qstr *d_name, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir); /* resize.c */ extern int ext4_group_add(struct super_block *sb, diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 3cde36bd8020..d50684b91496 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1005,6 +1005,16 @@ errout: return (err); } +static inline int search_dirblock(struct buffer_head *bh, + struct inode *dir, + const struct qstr *d_name, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir) +{ + return search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir, + d_name, offset, res_dir); +} + /* * Directory block splitting, compacting @@ -1098,11 +1108,13 @@ static inline int ext4_match (int len, const char * const name, /* * Returns 0 if not found, -1 on failure, and 1 on success */ -static inline int search_dirblock(struct buffer_head *bh, - struct inode *dir, - const struct qstr *d_name, - unsigned int offset, - struct ext4_dir_entry_2 ** res_dir) +int search_dir(struct buffer_head *bh, + char *search_buf, + int buf_size, + struct inode *dir, + const struct qstr *d_name, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir) { struct ext4_dir_entry_2 * de; char * dlimit; @@ -1110,8 +1122,8 @@ static inline int search_dirblock(struct buffer_head *bh, const char *name = d_name->name; int namelen = d_name->len; - de = (struct ext4_dir_entry_2 *) bh->b_data; - dlimit = bh->b_data + dir->i_sb->s_blocksize; + de = (struct ext4_dir_entry_2 *)search_buf; + dlimit = search_buf + buf_size; while ((char *) de < dlimit) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ -- cgit v1.2.1 From e8e948e7802a2ab05c146d3e72a39b93b5718236 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:06:00 -0500 Subject: ext4: let ext4_find_entry handle inline data Create a new function ext4_find_inline_entry() to handle the case of inline data. 
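[ Illustration: the probe order inside ext4_find_inline_entry(), condensed from the function below (locking and error paths trimmed). The i_block area is scanned first, skipping the 4-byte parent-inode prefix that replaces "." and ".."; if the dir has grown into the in-inode xattr space, that region is scanned too:

	inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
		       EXT4_INLINE_DOTDOT_SIZE;
	inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
	ret = search_dir(iloc.bh, inline_start, inline_size,
			 dir, d_name, 0, res_dir);

	if (ret != 1 && ext4_get_inline_size(dir) > EXT4_MIN_INLINE_DATA_SIZE) {
		inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
		inline_size = ext4_get_inline_size(dir) -
			      EXT4_MIN_INLINE_DATA_SIZE;
		ret = search_dir(iloc.bh, inline_start, inline_size,
				 dir, d_name, 0, res_dir);
	}
]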
Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/inline.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/namei.c | 10 +++++++++- fs/ext4/xattr.h | 13 +++++++++++++ 3 files changed, 70 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 471504133c76..0a8f5a865496 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1462,6 +1462,54 @@ out: return ret; } +struct buffer_head *ext4_find_inline_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, + int *has_inline_data) +{ + int ret; + struct ext4_iloc iloc; + void *inline_start; + int inline_size; + + if (ext4_get_inode_loc(dir, &iloc)) + return NULL; + + down_read(&EXT4_I(dir)->xattr_sem); + if (!ext4_has_inline_data(dir)) { + *has_inline_data = 0; + goto out; + } + + inline_start = (void *)ext4_raw_inode(&iloc)->i_block + + EXT4_INLINE_DOTDOT_SIZE; + inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; + ret = search_dir(iloc.bh, inline_start, inline_size, + dir, d_name, 0, res_dir); + if (ret == 1) + goto out_find; + if (ret < 0) + goto out; + + if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE) + goto out; + + inline_start = ext4_get_inline_xattr_pos(dir, &iloc); + inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; + + ret = search_dir(iloc.bh, inline_start, inline_size, + dir, d_name, 0, res_dir); + if (ret == 1) + goto out_find; + +out: + brelse(iloc.bh); + iloc.bh = NULL; +out_find: + up_read(&EXT4_I(dir)->xattr_sem); + return iloc.bh; +} + int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) { int ret; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index d50684b91496..b498cafed12b 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1015,7 +1015,6 @@ static inline int search_dirblock(struct buffer_head *bh, d_name, offset, res_dir); } - /* * Directory block splitting, compacting */ @@ -1198,6 +1197,15 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, namelen = d_name->len; if (namelen > EXT4_NAME_LEN) return NULL; + + if (ext4_has_inline_data(dir)) { + int has_inline_data = 1; + ret = ext4_find_inline_entry(dir, d_name, res_dir, + &has_inline_data); + if (has_inline_data) + return ret; + } + if ((namelen <= 2) && (name[0] == '.') && (name[1] == '.' || name[1] == '\0')) { /* diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 539e6a08c95f..c6f3dea88d6f 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -171,6 +171,10 @@ extern int ext4_try_create_inline_dir(handle_t *handle, extern int ext4_read_inline_dir(struct file *filp, void *dirent, filldir_t filldir, int *has_inline_data); +extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, + int *has_inline_data); # else /* CONFIG_EXT4_FS_XATTR */ static inline int @@ -355,6 +359,15 @@ static inline int ext4_read_inline_dir(struct file *filp, void *dirent, filldir_t filldir, int *has_inline_data) { return 0; } + +static inline struct buffer_head * +ext4_find_inline_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, + int *has_inline_data) +{ + return NULL; +} # endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY -- cgit v1.2.1 From 05019a9e7f025133f20c67677c9c8551eca3c6dc Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:06:00 -0500 Subject: ext4: make ext4_delete_entry generic Currently ext4_delete_entry() is used only for removing a dir entry from a dir block.
So let us create a new function, ext4_generic_delete_entry(), which takes an entry_buf and a buf_size so that it can be used for inline data. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 7 ++++++ fs/ext4/namei.c | 72 ++++++++++++++++++++++++++++++++++++--------------------- 2 files changed, 53 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a971b65bf5ca..6cfe546282dc 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2129,6 +2129,13 @@ extern int search_dir(struct buffer_head *bh, const struct qstr *d_name, unsigned int offset, struct ext4_dir_entry_2 **res_dir); +extern int ext4_generic_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + void *entry_buf, + int buf_size, + int csum_size); /* resize.c */ extern int ext4_group_add(struct super_block *sb, diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index b498cafed12b..c10fc2631ff5 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2109,37 +2109,29 @@ cleanup: } /* - * ext4_delete_entry deletes a directory entry by merging it with the - * previous entry + * ext4_generic_delete_entry deletes a directory entry by merging it + * with the previous entry */ -static int ext4_delete_entry(handle_t *handle, - struct inode *dir, - struct ext4_dir_entry_2 *de_del, - struct buffer_head *bh) +int ext4_generic_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + void *entry_buf, + int buf_size, + int csum_size) { struct ext4_dir_entry_2 *de, *pde; unsigned int blocksize = dir->i_sb->s_blocksize; - int csum_size = 0; - int i, err; - - if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) - csum_size = sizeof(struct ext4_dir_entry_tail); + int i; i = 0; pde = NULL; - de = (struct ext4_dir_entry_2 *) bh->b_data; - while (i < bh->b_size - csum_size) { + de = (struct ext4_dir_entry_2 *)entry_buf; + while (i < buf_size - csum_size) { if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, bh->b_size, i)) return -EIO; if (de == de_del) { - BUFFER_TRACE(bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh); - if (unlikely(err)) { - ext4_std_error(dir->i_sb, err); - return err; - } if (pde) pde->rec_len = ext4_rec_len_to_disk( ext4_rec_len_from_disk(pde->rec_len, @@ -2150,12 +2142,6 @@ static int ext4_delete_entry(handle_t *handle, else de->inode = 0; dir->i_version++; - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_dirent_node(handle, dir, bh); - if (unlikely(err)) { - ext4_std_error(dir->i_sb, err); - return err; - } return 0; } i += ext4_rec_len_from_disk(de->rec_len, blocksize); @@ -2165,6 +2151,40 @@ static int ext4_delete_entry(handle_t *handle, return -ENOENT; } +static int ext4_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh) +{ + int err, csum_size = 0; + + if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); + + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); + if (unlikely(err)) + goto out; + + err = ext4_generic_delete_entry(handle, dir, de_del, + bh, bh->b_data, + dir->i_sb->s_blocksize, csum_size); + if (err) + goto out; + + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_dirent_node(handle, dir, bh); + if (unlikely(err)) + goto out; +
return 0; +out: + if (err != -ENOENT) + ext4_std_error(dir->i_sb, err); + return err; +} + /* * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2, * since this indicates that nlinks count was previously 1. -- cgit v1.2.1 From 9f40fe54635b7533f51993d0f5e7f014fc14d33a Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:06:00 -0500 Subject: ext4: let ext4_delete_entry() handle inline data Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/inline.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/namei.c | 8 ++++++++ fs/ext4/xattr.h | 13 +++++++++++++ 3 files changed, 76 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 0a8f5a865496..f5e9c0e6d737 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1510,6 +1510,61 @@ out_find: return iloc.bh; } +int ext4_delete_inline_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + int *has_inline_data) +{ + int err, inline_size; + struct ext4_iloc iloc; + void *inline_start; + + err = ext4_get_inode_loc(dir, &iloc); + if (err) + return err; + + down_write(&EXT4_I(dir)->xattr_sem); + if (!ext4_has_inline_data(dir)) { + *has_inline_data = 0; + goto out; + } + + if ((void *)de_del - ((void *)ext4_raw_inode(&iloc)->i_block) < + EXT4_MIN_INLINE_DATA_SIZE) { + inline_start = (void *)ext4_raw_inode(&iloc)->i_block + + EXT4_INLINE_DOTDOT_SIZE; + inline_size = EXT4_MIN_INLINE_DATA_SIZE - + EXT4_INLINE_DOTDOT_SIZE; + } else { + inline_start = ext4_get_inline_xattr_pos(dir, &iloc); + inline_size = ext4_get_inline_size(dir) - + EXT4_MIN_INLINE_DATA_SIZE; + } + + err = ext4_journal_get_write_access(handle, bh); + if (err) + goto out; + + err = ext4_generic_delete_entry(handle, dir, de_del, bh, + inline_start, inline_size, 0); + if (err) + goto out; + + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_mark_inode_dirty(handle, dir); + if (unlikely(err)) + goto out; + + ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size); +out: + up_write(&EXT4_I(dir)->xattr_sem); + brelse(iloc.bh); + if (err != -ENOENT) + ext4_std_error(dir->i_sb, err); + return err; +} + int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) { int ret; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index c10fc2631ff5..a32228a73df0 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2158,6 +2158,14 @@ static int ext4_delete_entry(handle_t *handle, { int err, csum_size = 0; + if (ext4_has_inline_data(dir)) { + int has_inline_data = 1; + err = ext4_delete_inline_entry(handle, dir, de_del, bh, + &has_inline_data); + if (has_inline_data) + return err; + } + if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) csum_size = sizeof(struct ext4_dir_entry_tail); diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index c6f3dea88d6f..f86e424d75e4 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -175,6 +175,11 @@ extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, int *has_inline_data); +extern int ext4_delete_inline_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + int *has_inline_data); # else /* CONFIG_EXT4_FS_XATTR */ static inline int @@ -368,6 +373,14 @@ ext4_find_inline_entry(struct inode *dir, { return NULL; } +static inline int ext4_delete_inline_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct 
buffer_head *bh, + int *has_inline_data) +{ + return 0; +} # endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY -- cgit v1.2.1 From 61f86638d8a656101bb0f9c41c55d9685f8a2357 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:06:01 -0500 Subject: ext4: let empty_dir handle inline dir empty_dir is used when deleting a dir. So it should handle inline dir properly. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/inline.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/namei.c | 8 +++++ fs/ext4/xattr.h | 6 ++++ 3 files changed, 104 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index f5e9c0e6d737..e5da458fabad 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1565,6 +1565,96 @@ out: return err; } +/* + * Get the inline dentry at offset. + */ +static inline struct ext4_dir_entry_2 * +ext4_get_inline_entry(struct inode *inode, + struct ext4_iloc *iloc, + unsigned int offset, + void **inline_start, + int *inline_size) +{ + void *inline_pos; + + BUG_ON(offset > ext4_get_inline_size(inode)); + + if (offset < EXT4_MIN_INLINE_DATA_SIZE) { + inline_pos = (void *)ext4_raw_inode(iloc)->i_block; + *inline_size = EXT4_MIN_INLINE_DATA_SIZE; + } else { + inline_pos = ext4_get_inline_xattr_pos(inode, iloc); + offset -= EXT4_MIN_INLINE_DATA_SIZE; + *inline_size = ext4_get_inline_size(inode) - + EXT4_MIN_INLINE_DATA_SIZE; + } + + if (inline_start) + *inline_start = inline_pos; + return (struct ext4_dir_entry_2 *)(inline_pos + offset); +} + +int empty_inline_dir(struct inode *dir, int *has_inline_data) +{ + int err, inline_size; + struct ext4_iloc iloc; + void *inline_pos; + unsigned int offset; + struct ext4_dir_entry_2 *de; + int ret = 1; + + err = ext4_get_inode_loc(dir, &iloc); + if (err) { + EXT4_ERROR_INODE(dir, "error %d getting inode %lu block", + err, dir->i_ino); + return 1; + } + + down_read(&EXT4_I(dir)->xattr_sem); + if (!ext4_has_inline_data(dir)) { + *has_inline_data = 0; + goto out; + } + + de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; + if (!le32_to_cpu(de->inode)) { + ext4_warning(dir->i_sb, + "bad inline directory (dir #%lu) - no `..'", + dir->i_ino); + ret = 1; + goto out; + } + + offset = EXT4_INLINE_DOTDOT_SIZE; + while (offset < dir->i_size) { + de = ext4_get_inline_entry(dir, &iloc, offset, + &inline_pos, &inline_size); + if (ext4_check_dir_entry(dir, NULL, de, + iloc.bh, inline_pos, + inline_size, offset)) { + ext4_warning(dir->i_sb, + "bad inline directory (dir #%lu) - " + "inode %u, rec_len %u, name_len %d" + "inline size %d\n", + dir->i_ino, le32_to_cpu(de->inode), + le16_to_cpu(de->rec_len), de->name_len, + inline_size); + ret = 1; + goto out; + } + if (le32_to_cpu(de->inode)) { + ret = 0; + goto out; + } + offset += ext4_rec_len_from_disk(de->rec_len, inline_size); + } + +out: + up_read(&EXT4_I(dir)->xattr_sem); + brelse(iloc.bh); + return ret; +} + int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) { int ret; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a32228a73df0..e3e20d0aa299 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2464,6 +2464,14 @@ static int empty_dir(struct inode *inode) struct super_block *sb; int err = 0; + if (ext4_has_inline_data(inode)) { + int has_inline_data = 1; + + err = empty_inline_dir(inode, &has_inline_data); + if (has_inline_data) + return err; + } + sb = inode->i_sb; if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { diff --git 
a/fs/ext4/xattr.h b/fs/ext4/xattr.h index f86e424d75e4..7747bbcebb33 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -180,6 +180,7 @@ extern int ext4_delete_inline_entry(handle_t *handle, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh, int *has_inline_data); +extern int empty_inline_dir(struct inode *dir, int *has_inline_data); # else /* CONFIG_EXT4_FS_XATTR */ static inline int @@ -381,6 +382,11 @@ static inline int ext4_delete_inline_entry(handle_t *handle, { return 0; } + +static inline int empty_inline_dir(struct inode *dir, int *has_inline_data) +{ + return 0; +} # endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY -- cgit v1.2.1 From 32f7f22c0b52e8189fef83986b16dc7abe95f2c4 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:06:01 -0500 Subject: ext4: let ext4_rename handle inline dir In case we rename a directory, ext4_rename has to read the dir block and change its dotdot's information. The old ext4_rename encapsulated the dir_block read into itself. So this patch adds a new function ext4_get_first_dir_block() which gets the dir buffer information so the ext4_rename can handle it properly. As it will also change the parent inode number, we return the parent_de so that ext4_rename() can handle it more easily. ext4_find_entry is also changed so that the caller(rename) can tell whether the found entry is an inlined one or not and journaling the corresponding buffer head. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/inline.c | 15 ++++++++ fs/ext4/namei.c | 109 +++++++++++++++++++++++++++++++++++++------------------ fs/ext4/xattr.h | 11 ++++++ 3 files changed, 100 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index e5da458fabad..fc3629980925 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1424,6 +1424,21 @@ out: return ret; } +struct buffer_head *ext4_get_first_inline_block(struct inode *inode, + struct ext4_dir_entry_2 **parent_de, + int *retval) +{ + struct ext4_iloc iloc; + + *retval = ext4_get_inode_loc(inode, &iloc); + if (*retval) + return NULL; + + *parent_de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; + + return iloc.bh; +} + /* * Try to create the inline data for the new dir. * If it succeeds, return 0, otherwise return the error. 
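ext4_get_first_inline_block() above exists because the entry a rename must rewrite — the ".." parent pointer — can now live in two places. The following is a rough userspace C model of that dispatch; all toy_* names are invented for illustration, and the real code returns a buffer_head plus an inlined flag rather than a raw pointer:

#include <stdint.h>
#include <stdio.h>

struct toy_inode {
	int      has_inline;     /* directory stored in the inode body? */
	uint32_t i_block_parent; /* parent ino when inline (lives in i_block) */
};

enum parent_loc { PARENT_IN_INODE, PARENT_IN_FIRST_BLOCK };

/* Return a pointer to the parent-inode field and report where it is,
 * so the caller knows whether to dirty the inode or the dir block. */
static uint32_t *toy_get_parent_ref(struct toy_inode *dir,
				    uint32_t *first_block_parent,
				    enum parent_loc *loc)
{
	if (dir->has_inline) {
		*loc = PARENT_IN_INODE;
		return &dir->i_block_parent;
	}
	*loc = PARENT_IN_FIRST_BLOCK;
	return first_block_parent;
}

int main(void)
{
	struct toy_inode dir = { .has_inline = 1, .i_block_parent = 2 };
	uint32_t blk_parent = 2;   /* stand-in for the ".." slot in block 0 */
	enum parent_loc loc;
	uint32_t *p = toy_get_parent_ref(&dir, &blk_parent, &loc);

	*p = 42;   /* the rename: repoint ".." at the new parent */
	printf("parent=%u loc=%d\n", (unsigned)dir.i_block_parent, (int)loc);
	return 0;
}

Returning where the field lives (and not just the field) is the design point: the caller must journal the change differently for the inline and block cases, which is exactly what the ext4_rename() hunks below do.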
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index e3e20d0aa299..b37c21839833 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1176,7 +1176,8 @@ static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, */ static struct buffer_head * ext4_find_entry (struct inode *dir, const struct qstr *d_name, - struct ext4_dir_entry_2 ** res_dir) + struct ext4_dir_entry_2 **res_dir, + int *inlined) { struct super_block *sb; struct buffer_head *bh_use[NAMEI_RA_SIZE]; @@ -1202,8 +1203,11 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, int has_inline_data = 1; ret = ext4_find_inline_entry(dir, d_name, res_dir, &has_inline_data); - if (has_inline_data) + if (has_inline_data) { + if (inlined) + *inlined = 1; return ret; + } } if ((namelen <= 2) && (name[0] == '.') && @@ -1390,7 +1394,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi if (dentry->d_name.len > EXT4_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - bh = ext4_find_entry(dir, &dentry->d_name, &de); + bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); inode = NULL; if (bh) { __u32 ino = le32_to_cpu(de->inode); @@ -1424,7 +1428,7 @@ struct dentry *ext4_get_parent(struct dentry *child) struct ext4_dir_entry_2 * de; struct buffer_head *bh; - bh = ext4_find_entry(child->d_inode, &dotdot, &de); + bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL); if (!bh) return ERR_PTR(-ENOENT); ino = le32_to_cpu(de->inode); @@ -2725,7 +2729,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) return PTR_ERR(handle); retval = -ENOENT; - bh = ext4_find_entry(dir, &dentry->d_name, &de); + bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); if (!bh) goto end_rmdir; @@ -2790,7 +2794,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) ext4_handle_sync(handle); retval = -ENOENT; - bh = ext4_find_entry(dir, &dentry->d_name, &de); + bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); if (!bh) goto end_unlink; @@ -2972,8 +2976,39 @@ retry: return err; } -#define PARENT_INO(buffer, size) \ - (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode) + +/* + * Try to find buffer head where contains the parent block. + * It should be the inode block if it is inlined or the 1st block + * if it is a normal dir. 
+ */ +static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, + struct inode *inode, + int *retval, + struct ext4_dir_entry_2 **parent_de, + int *inlined) +{ + struct buffer_head *bh; + + if (!ext4_has_inline_data(inode)) { + if (!(bh = ext4_bread(handle, inode, 0, 0, retval))) { + if (!*retval) { + *retval = -EIO; + ext4_error(inode->i_sb, + "Directory hole detected on inode %lu\n", + inode->i_ino); + } + return NULL; + } + *parent_de = ext4_next_entry( + (struct ext4_dir_entry_2 *)bh->b_data, + inode->i_sb->s_blocksize); + return bh; + } + + *inlined = 1; + return ext4_get_first_inline_block(inode, parent_de, retval); +} /* * Anybody can rename anything with this: the permission checks are left to the @@ -2987,6 +3022,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, struct buffer_head *old_bh, *new_bh, *dir_bh; struct ext4_dir_entry_2 *old_de, *new_de; int retval, force_da_alloc = 0; + int inlined = 0, new_inlined = 0; + struct ext4_dir_entry_2 *parent_de; dquot_initialize(old_dir); dquot_initialize(new_dir); @@ -3006,7 +3043,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) ext4_handle_sync(handle); - old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de); + old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL); /* * Check for inode number is _not_ due to possible IO errors. * We might rmdir the source, keep it as pwd of some process @@ -3019,7 +3056,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, goto end_rename; new_inode = new_dentry->d_inode; - new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de); + new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, + &new_de, &new_inlined); if (new_bh) { if (!new_inode) { brelse(new_bh); @@ -3033,22 +3071,17 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, goto end_rename; } retval = -EIO; - if (!(dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval))) { - if (!retval) { - retval = -EIO; - ext4_error(old_inode->i_sb, - "Directory hole detected on inode %lu\n", - old_inode->i_ino); - } + dir_bh = ext4_get_first_dir_block(handle, old_inode, + &retval, &parent_de, + &inlined); + if (!dir_bh) goto end_rename; - } - if (!buffer_verified(dir_bh) && + if (!inlined && !buffer_verified(dir_bh) && !ext4_dirent_csum_verify(old_inode, (struct ext4_dir_entry *)dir_bh->b_data)) goto end_rename; set_buffer_verified(dir_bh); - if (le32_to_cpu(PARENT_INO(dir_bh->b_data, - old_dir->i_sb->s_blocksize)) != old_dir->i_ino) + if (le32_to_cpu(parent_de->inode) != old_dir->i_ino) goto end_rename; retval = -EMLINK; if (!new_inode && new_dir != old_dir && @@ -3077,10 +3110,13 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, ext4_current_time(new_dir); ext4_mark_inode_dirty(handle, new_dir); BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); - retval = ext4_handle_dirty_dirent_node(handle, new_dir, new_bh); - if (unlikely(retval)) { - ext4_std_error(new_dir->i_sb, retval); - goto end_rename; + if (!new_inlined) { + retval = ext4_handle_dirty_dirent_node(handle, + new_dir, new_bh); + if (unlikely(retval)) { + ext4_std_error(new_dir->i_sb, retval); + goto end_rename; + } } brelse(new_bh); new_bh = NULL; @@ -3108,7 +3144,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, struct buffer_head *old_bh2; struct ext4_dir_entry_2 *old_de2; - old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2); + 
old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, + &old_de2, NULL); if (old_bh2) { retval = ext4_delete_entry(handle, old_dir, old_de2, old_bh2); @@ -3128,17 +3165,19 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); ext4_update_dx_flag(old_dir); if (dir_bh) { - PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = - cpu_to_le32(new_dir->i_ino); + parent_de->inode = cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); - if (is_dx(old_inode)) { - retval = ext4_handle_dirty_dx_node(handle, - old_inode, - dir_bh); + if (!inlined) { + if (is_dx(old_inode)) { + retval = ext4_handle_dirty_dx_node(handle, + old_inode, + dir_bh); + } else { + retval = ext4_handle_dirty_dirent_node(handle, + old_inode, dir_bh); + } } else { - retval = ext4_handle_dirty_dirent_node(handle, - old_inode, - dir_bh); + retval = ext4_mark_inode_dirty(handle, old_inode); } if (retval) { ext4_std_error(old_dir->i_sb, retval); diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 7747bbcebb33..f6c3ca6dae46 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -181,6 +181,9 @@ extern int ext4_delete_inline_entry(handle_t *handle, struct buffer_head *bh, int *has_inline_data); extern int empty_inline_dir(struct inode *dir, int *has_inline_data); +extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, + struct ext4_dir_entry_2 **parent_de, + int *retval); # else /* CONFIG_EXT4_FS_XATTR */ static inline int @@ -387,6 +390,14 @@ static inline int empty_inline_dir(struct inode *dir, int *has_inline_data) { return 0; } + +static inline struct buffer_head * +ext4_get_first_inline_block(struct inode *inode, + struct ext4_dir_entry_2 **parent_de, + int *retval) +{ + return NULL; +} # endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY -- cgit v1.2.1 From 941919856c11d4dd11d4fcabb4dab58bd2b146bf Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:06:02 -0500 Subject: ext4: let fiemap work with inline data fiemap is used to find the disk layout of a file, as for inline data, let us just pretend like a file with just one extent. 
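The interesting part of the patch below is how the "physical" address of the inline extent is computed: it is plain byte arithmetic on where the inode's i_block array sits on disk. A self-contained userspace C sketch of that arithmetic, with a made-up inode layout (the real struct ext4_inode differs):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct toy_raw_inode {              /* stand-in for struct ext4_inode */
	uint8_t other_fields[40];   /* placeholder for the leading fields */
	uint8_t i_block[60];        /* inline data is kept here */
};

/* physical = (disk block holding the inode) * blocksize
 *          + offset of the inode inside that block
 *          + offset of i_block inside the inode */
static uint64_t inline_data_physical(uint64_t inode_blocknr,
				     unsigned int blocksize_bits,
				     unsigned int inode_off_in_block)
{
	return ((uint64_t)inode_blocknr << blocksize_bits) +
	       inode_off_in_block +
	       offsetof(struct toy_raw_inode, i_block);
}

int main(void)
{
	/* e.g. an inode 256 bytes into block 1234 of a 4k-block fs */
	printf("physical=%llu\n",
	       (unsigned long long)inline_data_physical(1234, 12, 256));
	return 0;
}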
Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 9 +++++++++ fs/ext4/inline.c | 35 +++++++++++++++++++++++++++++++++++ fs/ext4/xattr.h | 10 ++++++++++ 3 files changed, 54 insertions(+) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f2659f51b23d..70dc6fc53a00 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4802,6 +4802,15 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, ext4_lblk_t start_blk; int error = 0; + if (ext4_has_inline_data(inode)) { + int has_inline = 1; + + error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline); + + if (has_inline) + return error; + } + /* fallback to generic here if not in extents fmt */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return generic_block_fiemap(inode, fieinfo, start, len, diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index fc3629980925..bf5f77803885 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -15,6 +15,7 @@ #include "ext4.h" #include "xattr.h" #include "truncate.h" +#include <linux/fiemap.h> #define EXT4_XATTR_SYSTEM_DATA "data" #define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) @@ -1680,3 +1681,37 @@ int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) return ret; } + +int ext4_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + int *has_inline) +{ + __u64 physical = 0; + __u64 length; + __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST; + int error = 0; + struct ext4_iloc iloc; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + *has_inline = 0; + goto out; + } + + error = ext4_get_inode_loc(inode, &iloc); + if (error) + goto out; + + physical = iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; + physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; + physical += offsetof(struct ext4_inode, i_block); + length = i_size_read(inode); + + if (physical) + error = fiemap_fill_next_extent(fieinfo, 0, physical, + length, flags); + brelse(iloc.bh); +out: + up_read(&EXT4_I(inode)->xattr_sem); + return (error < 0 ? error : 0); +} diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index f6c3ca6dae46..5c7e55edfe6c 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -184,6 +184,9 @@ extern int empty_inline_dir(struct inode *dir, int *has_inline_data); extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, struct ext4_dir_entry_2 **parent_de, int *retval); +extern int ext4_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + int *has_inline); # else /* CONFIG_EXT4_FS_XATTR */ static inline int @@ -398,6 +401,13 @@ ext4_get_first_inline_block(struct inode *inode, { return NULL; } + +static inline int ext4_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + int *has_inline) +{ + return 0; +} # endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY -- cgit v1.2.1 From 0d812f77b36c16dff692390508155de2c7f95ea3 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:06:02 -0500 Subject: ext4: evict inline data out if we need to store xattr in inode Now that we store data in the inode, if we need to store some xattrs and the inode doesn't have enough space, Andreas suggested that we should keep the xattr (metadata) in and push the data out. So this patch does that work.
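The retry pattern this patch introduces is worth spelling out: an in-inode xattr set that fails with ENOSPC first evicts the inline data (if doing so would free enough room) and then retries. Everything below is a toy userspace C simulation of that flow; the toy_* helpers only stand in for ext4_xattr_set_entry() and ext4_try_to_evict_inline_data() and share none of their real logic:

#include <stdio.h>

#define TOY_ENOSPC 28

static int inline_bytes = 60;   /* space held by the inline-data entry */

/* Stand-in for the xattr set: succeed only if the space fits. */
static int toy_set_xattr(int free_bytes, int needed)
{
	return needed <= free_bytes ? 0 : -TOY_ENOSPC;
}

/* Stand-in for eviction: refuse if it cannot free enough space. */
static int toy_evict_inline(int needed)
{
	if (inline_bytes < needed)
		return -TOY_ENOSPC;
	inline_bytes = 0;        /* data pushed out to a real block */
	return 0;
}

int main(void)
{
	int free_bytes = 10, needed = 50;
	int err = toy_set_xattr(free_bytes, needed);

	if (err == -TOY_ENOSPC && toy_evict_inline(needed) == 0) {
		free_bytes += 60;                 /* space reclaimed */
		err = toy_set_xattr(free_bytes, needed);
	}
	printf("result=%d\n", err);
	return 0;
}

Note the up-front capacity check in the eviction path: if even evicting the whole inline-data entry cannot satisfy the request, the code bails out with ENOSPC instead of converting for nothing, which mirrors the size comparison in ext4_try_to_evict_inline_data() below.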
Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/inline.c | 48 +++++++++++++++++++++++++++++++++++++++++++----- fs/ext4/xattr.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++---- fs/ext4/xattr.h | 9 ++++++--- 3 files changed, 99 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index bf5f77803885..cec651e2646c 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -207,8 +207,8 @@ out: /* * write the buffer to the inline inode. * If 'create' is set, we don't need to do the extra copy in the xattr - * value since it is already handled by ext4_xattr_ibody_set. That saves - * us one memcpy. + * value since it is already handled by ext4_xattr_ibody_inline_set. + * That saves us one memcpy. */ void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, void *buffer, loff_t pos, unsigned int len) @@ -285,7 +285,7 @@ static int ext4_create_inline_data(handle_t *handle, BUG_ON(!is.s.not_found); - error = ext4_xattr_ibody_set(handle, inode, &i, &is); + error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); if (error) { if (error == -ENOSPC) ext4_clear_inode_state(inode, @@ -354,7 +354,7 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode, i.value = value; i.value_len = len; - error = ext4_xattr_ibody_set(handle, inode, &i, &is); + error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); if (error) goto out; @@ -427,7 +427,7 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle, if (error) goto out; - error = ext4_xattr_ibody_set(handle, inode, &i, &is); + error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); if (error) goto out; @@ -1715,3 +1715,41 @@ out: up_read(&EXT4_I(inode)->xattr_sem); return (error < 0 ? error : 0); } + +/* + * Called during xattr set, and if we can sparse space 'needed', + * just create the extent tree evict the data to the outer block. + * + * We use jbd2 instead of page cache to move data to the 1st block + * so that the whole transaction can be committed as a whole and + * the data isn't lost because of the delayed page cache write. 
+ */ +int ext4_try_to_evict_inline_data(handle_t *handle, + struct inode *inode, + int needed) +{ + int error; + struct ext4_xattr_entry *entry; + struct ext4_xattr_ibody_header *header; + struct ext4_inode *raw_inode; + struct ext4_iloc iloc; + + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + + raw_inode = ext4_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + entry = (struct ext4_xattr_entry *)((void *)raw_inode + + EXT4_I(inode)->i_inline_off); + if (EXT4_XATTR_LEN(entry->e_name_len) + + EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)) < needed) { + error = -ENOSPC; + goto out; + } + + error = ext4_convert_inline_data_nolock(handle, inode, &iloc); +out: + brelse(iloc.bh); + return error; +} diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index a47dc3883a23..2251769a3c53 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -958,9 +958,47 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, return 0; } -int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, - struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is) +int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_search *s = &is->s; + int error; + + if (EXT4_I(inode)->i_extra_isize == 0) + return -ENOSPC; + error = ext4_xattr_set_entry(i, s); + if (error) { + if (error == -ENOSPC && + ext4_has_inline_data(inode)) { + error = ext4_try_to_evict_inline_data(handle, inode, + EXT4_XATTR_LEN(strlen(i->name) + + EXT4_XATTR_SIZE(i->value_len))); + if (error) + return error; + error = ext4_xattr_ibody_find(inode, i, is); + if (error) + return error; + error = ext4_xattr_set_entry(i, s); + } + if (error) + return error; + } + header = IHDR(inode, ext4_raw_inode(&is->iloc)); + if (!IS_LAST_ENTRY(s->first)) { + header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); + ext4_set_inode_state(inode, EXT4_STATE_XATTR); + } else { + header->h_magic = cpu_to_le32(0); + ext4_clear_inode_state(inode, EXT4_STATE_XATTR); + } + return 0; +} + +static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) { struct ext4_xattr_ibody_header *header; struct ext4_xattr_search *s = &is->s; @@ -1116,9 +1154,17 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name, { handle_t *handle; int error, retries = 0; + int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb); retry: - handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); + /* + * In case of inline data, we may push out the data to a block, + * So reserve the journal space first. 
+ */ + if (ext4_has_inline_data(inode)) + credits += ext4_writepage_trans_blocks(inode) + 1; + + handle = ext4_journal_start(inode, credits); if (IS_ERR(handle)) { error = PTR_ERR(handle); } else { diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 5c7e55edfe6c..1be243aab01b 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -122,9 +122,9 @@ extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, extern int ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size); -extern int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, - struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is); +extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is); extern int ext4_has_inline_data(struct inode *inode); extern int ext4_get_inline_size(struct inode *inode); @@ -187,6 +187,9 @@ extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, extern int ext4_inline_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int *has_inline); +extern int ext4_try_to_evict_inline_data(handle_t *handle, + struct inode *inode, + int needed); # else /* CONFIG_EXT4_FS_XATTR */ static inline int -- cgit v1.2.1 From aef1c8513c1f8ae076e22ea2a57eff5835578e75 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:06:02 -0500 Subject: ext4: let ext4_truncate handle inline data correctly Signed-off-by: Robin Dong Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/inline.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/inode.c | 8 +++++ fs/ext4/xattr.h | 9 ++++++ 3 files changed, 107 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index cec651e2646c..727edb8d57e0 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1753,3 +1753,93 @@ out: brelse(iloc.bh); return error; } + +void ext4_inline_data_truncate(struct inode *inode, int *has_inline) +{ + handle_t *handle; + int inline_size, value_len, needed_blocks; + size_t i_size; + void *value = NULL; + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + }; + + + needed_blocks = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) + return; + + down_write(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + *has_inline = 0; + ext4_journal_stop(handle); + return; + } + + if (ext4_orphan_add(handle, inode)) + goto out; + + if (ext4_get_inode_loc(inode, &is.iloc)) + goto out; + + down_write(&EXT4_I(inode)->i_data_sem); + i_size = inode->i_size; + inline_size = ext4_get_inline_size(inode); + EXT4_I(inode)->i_disksize = i_size; + + if (i_size < inline_size) { + /* Clear the content in the xattr space. */ + if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) { + if (ext4_xattr_ibody_find(inode, &i, &is)) + goto out_error; + + BUG_ON(is.s.not_found); + + value_len = le32_to_cpu(is.s.here->e_value_size); + value = kmalloc(value_len, GFP_NOFS); + if (!value) + goto out_error; + + if (ext4_xattr_ibody_get(inode, i.name_index, i.name, + value, value_len)) + goto out_error; + + i.value = value; + i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ? 
+ i_size - EXT4_MIN_INLINE_DATA_SIZE : 0; + if (ext4_xattr_ibody_inline_set(handle, inode, &i, &is)) + goto out_error; + } + + /* Clear the content within i_blocks. */ + if (i_size < EXT4_MIN_INLINE_DATA_SIZE) + memset(ext4_raw_inode(&is.iloc)->i_block + i_size, 0, + EXT4_MIN_INLINE_DATA_SIZE - i_size); + + EXT4_I(inode)->i_inline_size = i_size < + EXT4_MIN_INLINE_DATA_SIZE ? + EXT4_MIN_INLINE_DATA_SIZE : i_size; + } + +out_error: + up_write(&EXT4_I(inode)->i_data_sem); +out: + brelse(is.iloc.bh); + up_write(&EXT4_I(inode)->xattr_sem); + kfree(value); + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + + ext4_journal_stop(handle); + return; +} diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f16ae02599cd..cb1c1ab2720b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3594,6 +3594,14 @@ void ext4_truncate(struct inode *inode) if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); + if (ext4_has_inline_data(inode)) { + int has_inline = 1; + + ext4_inline_data_truncate(inode, &has_inline); + if (has_inline) + return; + } + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ext4_ext_truncate(inode); else diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 1be243aab01b..1a71a97e14ad 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -190,6 +190,8 @@ extern int ext4_inline_data_fiemap(struct inode *inode, extern int ext4_try_to_evict_inline_data(handle_t *handle, struct inode *inode, int needed); +extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline); + # else /* CONFIG_EXT4_FS_XATTR */ static inline int @@ -411,6 +413,13 @@ static inline int ext4_inline_data_fiemap(struct inode *inode, { return 0; } + +static inline void ext4_inline_data_truncate(struct inode *inode, + int *has_inline) +{ + return; +} + # endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY -- cgit v1.2.1 From 0c8d414f163f5d35e43a4de7a6e5ee8c253fcccf Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:06:03 -0500 Subject: ext4: let fallocate handle inline data correctly If we are punching hole in a file, we will return ENOTSUPP. As for the fallocation of some extents, we will convert the inline data to a normal extent based file first. 
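A toy C model of the ordering described above: punching a hole in an inline file is refused, while a normal allocation first converts the inline data to an extent-based layout before proceeding. The names and the error constant are invented, and this mirrors the behaviour the commit message describes rather than the exact kernel call chain:

#include <stdio.h>

#define TOY_EOPNOTSUPP 95

struct toy_inode { int has_inline; };

static int toy_convert_inline(struct toy_inode *inode)
{
	inode->has_inline = 0;   /* pretend data moved out to a block */
	return 0;
}

static int toy_fallocate(struct toy_inode *inode, int punch_hole)
{
	int err;

	if (punch_hole)          /* no hole punching in inline data */
		return inode->has_inline ? -TOY_EOPNOTSUPP : 0;

	err = toy_convert_inline(inode);   /* flush inline data first */
	if (err)
		return err;
	/* ... normal extent allocation would proceed here ... */
	return 0;
}

int main(void)
{
	struct toy_inode ino = { .has_inline = 1 };

	printf("punch=%d\n", toy_fallocate(&ino, 1));
	printf("alloc=%d\n", toy_fallocate(&ino, 0));
	return 0;
}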
Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 4 ++++ fs/ext4/inline.c | 39 +++++++++++++++++++++++++++++++++++++++ fs/ext4/xattr.h | 5 +++++ 3 files changed, 48 insertions(+) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 70dc6fc53a00..d45ff3faefc6 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4399,6 +4399,10 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (mode & FALLOC_FL_PUNCH_HOLE) return ext4_punch_hole(file, offset, len); + ret = ext4_convert_inline_data(inode); + if (ret) + return ret; + trace_ext4_fallocate_enter(inode, offset, len, mode); map.m_lblk = offset >> blkbits; /* diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 727edb8d57e0..53b2f65091dd 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1843,3 +1843,42 @@ out: ext4_journal_stop(handle); return; } + +int ext4_convert_inline_data(struct inode *inode) +{ + int error, needed_blocks; + handle_t *handle; + struct ext4_iloc iloc; + + if (!ext4_has_inline_data(inode)) { + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + return 0; + } + + needed_blocks = ext4_writepage_trans_blocks(inode); + + iloc.bh = NULL; + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + + handle = ext4_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto out_free; + } + + down_write(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + up_write(&EXT4_I(inode)->xattr_sem); + goto out; + } + + error = ext4_convert_inline_data_nolock(handle, inode, &iloc); + up_write(&EXT4_I(inode)->xattr_sem); +out: + ext4_journal_stop(handle); +out_free: + brelse(iloc.bh); + return error; +} diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 1a71a97e14ad..4222388c772f 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -192,6 +192,7 @@ extern int ext4_try_to_evict_inline_data(handle_t *handle, int needed); extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline); +extern int ext4_convert_inline_data(struct inode *inode); # else /* CONFIG_EXT4_FS_XATTR */ static inline int @@ -420,6 +421,10 @@ static inline void ext4_inline_data_truncate(struct inode *inode, return; } +static inline int ext4_convert_inline_data(struct inode *inode) +{ + return 0; +} # endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY -- cgit v1.2.1 From f08225d176a5736363beea653b9b3fb9400c1255 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:06:03 -0500 Subject: ext4: enable ext4 inline support Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 3 ++- fs/ext4/ialloc.c | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6cfe546282dc..b90e2720b826 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1529,7 +1529,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) EXT4_FEATURE_INCOMPAT_EXTENTS| \ EXT4_FEATURE_INCOMPAT_64BIT| \ EXT4_FEATURE_INCOMPAT_FLEX_BG| \ - EXT4_FEATURE_INCOMPAT_MMP) + EXT4_FEATURE_INCOMPAT_MMP | \ + EXT4_FEATURE_INCOMPAT_INLINE_DATA) #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index c7efa88d7149..3f32c8012447 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -902,6 +902,10 @@ got: ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; + ei->i_inline_off = 0; + 
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA)) + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + ret = inode; dquot_initialize(inode); err = dquot_alloc_inode(inode); -- cgit v1.2.1 From 64744e03c6871e5e4678478bab1b8c3ba6cca395 Mon Sep 17 00:00:00 2001 From: Guo Chao Date: Mon, 10 Dec 2012 14:06:03 -0500 Subject: ext4: use sync_inode_metadata() when syncing inode metadata We have a dedicated interface to sync inode metadata. Use it to simplify ext4's code some. Signed-off-by: Guo Chao Signed-off-by: "Theodore Ts'o" Reviewed-by: Lukas Czerner --- fs/ext4/fsync.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index be1d89f385b4..dfbc1fe96674 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -44,7 +44,6 @@ */ static int ext4_sync_parent(struct inode *inode) { - struct writeback_control wbc; struct dentry *dentry = NULL; struct inode *next; int ret = 0; @@ -66,10 +65,7 @@ static int ext4_sync_parent(struct inode *inode) ret = sync_mapping_buffers(inode->i_mapping); if (ret) break; - memset(&wbc, 0, sizeof(wbc)); - wbc.sync_mode = WB_SYNC_ALL; - wbc.nr_to_write = 0; /* only write out the inode */ - ret = sync_inode(inode, &wbc); + ret = sync_inode_metadata(inode, 1); if (ret) break; } -- cgit v1.2.1 From a789f49c9272e81f4f52487e94820182d0a2d2ff Mon Sep 17 00:00:00 2001 From: Guo Chao Date: Mon, 10 Dec 2012 14:06:04 -0500 Subject: ext4: remove redundant code in ext4_alloc_inode() inode_init_always() will initialize inode->i_data.writeback_index anyway, no need to do this in ext4_alloc_inode(). Signed-off-by: Guo Chao Signed-off-by: "Theodore Ts'o" Reviewed-by: Lukas Czerner --- fs/ext4/super.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 856206f255aa..c2ea525e85c6 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -939,7 +939,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) return NULL; ei->vfs_inode.i_version = 1; - ei->vfs_inode.i_data.writeback_index = 0; memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); -- cgit v1.2.1 From 6b280c913ee02a1a41b020a74c41584f2fca582a Mon Sep 17 00:00:00 2001 From: Guo Chao Date: Mon, 10 Dec 2012 14:06:04 -0500 Subject: ext4: remove redundant initialization in ext4_fill_super() We use kzalloc() to allocate sbi, no need to zero its field. 
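The cleanup rests on a simple allocator guarantee, shown here with the userspace analogue: calloc(), like the kernel's kzalloc(), hands back zero-filled memory, so assigning 0 to a field right after allocation is dead code. toy_sb_info is an illustrative stand-in, not the real ext4_sb_info:

#include <stdio.h>
#include <stdlib.h>

struct toy_sb_info {
	unsigned long s_resize_flags;
	/* ... many more fields ... */
};

int main(void)
{
	struct toy_sb_info *sbi = calloc(1, sizeof(*sbi));

	if (!sbi)
		return 1;
	/* no "sbi->s_resize_flags = 0" needed: already zero */
	printf("s_resize_flags=%lu\n", sbi->s_resize_flags);
	free(sbi);
	return 0;
}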
Signed-off-by: Guo Chao Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c2ea525e85c6..e1e216f8e9bd 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3798,7 +3798,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); - sbi->s_resize_flags = 0; sb->s_root = NULL; -- cgit v1.2.1 From 187fd030d801b02b0daeb010dbf7c0113be3156d Mon Sep 17 00:00:00 2001 From: Zhi Yong Wu Date: Mon, 10 Dec 2012 14:06:04 -0500 Subject: ext4: remove unused variable from ext4_ext_in_cache() Signed-off-by: "Theodore Ts'o" Signed-off-by: Zhi Yong Wu Reviewed-by: Zheng Liu --- fs/ext4/extents.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d45ff3faefc6..26af22832a84 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2194,7 +2194,6 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, struct ext4_extent *ex) { struct ext4_ext_cache *cex; - struct ext4_sb_info *sbi; int ret = 0; /* @@ -2202,7 +2201,6 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, */ spin_lock(&EXT4_I(inode)->i_block_reservation_lock); cex = &EXT4_I(inode)->i_cached_extent; - sbi = EXT4_SB(inode->i_sb); /* has cache valid data? */ if (cex->ec_len == 0) -- cgit v1.2.1 From 9a9c6478a8b6ce8b6da6b6d1e15f365b505895cd Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Tue, 4 Dec 2012 14:29:27 +0300 Subject: nfsd: make NFSv4 recovery client tracking options per net The pointer to the client tracking operations - client_tracking_ops - has to be containerized, because different environments can support different trackers (for example, the legacy tracker is currently not supported in a container). Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. Bruce Fields --- fs/nfsd/netns.h | 2 ++ fs/nfsd/nfs4recover.c | 48 ++++++++++++++++++++++++++++-------------------- 2 files changed, 30 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 964b5542f027..fac4123c918c 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -35,6 +35,7 @@ #define SESSION_HASH_SIZE 512 struct cld_net; +struct nfsd4_client_tracking_ops; struct nfsd_net { struct cld_net *cld_net; @@ -87,6 +88,7 @@ struct nfsd_net { struct file *rec_file; bool in_grace; + struct nfsd4_client_tracking_ops *client_tracking_ops; time_t nfsd4_lease; time_t nfsd4_grace; diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 359793f89493..ba6fdd4a0455 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -63,7 +63,6 @@ struct nfsd4_client_tracking_ops { /* Globals */ static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; -static struct nfsd4_client_tracking_ops *client_tracking_ops; static int nfs4_save_creds(const struct cred **original_creds) @@ -1262,17 +1261,18 @@ nfsd4_client_tracking_init(struct net *net) { int status; struct path path; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); /* just run the init if it the method is already decided */ - if (client_tracking_ops) + if (nn->client_tracking_ops) goto do_init; /* * First, try a UMH upcall. It should succeed or fail quickly, so * there's little harm in trying that first. 
*/ - client_tracking_ops = &nfsd4_umh_tracking_ops; - status = client_tracking_ops->init(net); + nn->client_tracking_ops = &nfsd4_umh_tracking_ops; + status = nn->client_tracking_ops->init(net); if (!status) return status; @@ -1280,7 +1280,7 @@ nfsd4_client_tracking_init(struct net *net) * See if the recoverydir exists and is a directory. If it is, * then use the legacy ops. */ - client_tracking_ops = &nfsd4_legacy_tracking_ops; + nn->client_tracking_ops = &nfsd4_legacy_tracking_ops; status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path); if (!status) { status = S_ISDIR(path.dentry->d_inode->i_mode); @@ -1290,16 +1290,16 @@ nfsd4_client_tracking_init(struct net *net) } /* Finally, try to use nfsdcld */ - client_tracking_ops = &nfsd4_cld_tracking_ops; + nn->client_tracking_ops = &nfsd4_cld_tracking_ops; printk(KERN_WARNING "NFSD: the nfsdcld client tracking upcall will be " "removed in 3.10. Please transition to using " "nfsdcltrack.\n"); do_init: - status = client_tracking_ops->init(net); + status = nn->client_tracking_ops->init(net); if (status) { printk(KERN_WARNING "NFSD: Unable to initialize client " "recovery tracking! (%d)\n", status); - client_tracking_ops = NULL; + nn->client_tracking_ops = NULL; } return status; } @@ -1307,32 +1307,40 @@ do_init: void nfsd4_client_tracking_exit(struct net *net) { - if (client_tracking_ops) { - if (client_tracking_ops->exit) - client_tracking_ops->exit(net); - client_tracking_ops = NULL; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + if (nn->client_tracking_ops) { + if (nn->client_tracking_ops->exit) + nn->client_tracking_ops->exit(net); + nn->client_tracking_ops = NULL; } } void nfsd4_client_record_create(struct nfs4_client *clp) { - if (client_tracking_ops) - client_tracking_ops->create(clp); + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + if (nn->client_tracking_ops) + nn->client_tracking_ops->create(clp); } void nfsd4_client_record_remove(struct nfs4_client *clp) { - if (client_tracking_ops) - client_tracking_ops->remove(clp); + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + if (nn->client_tracking_ops) + nn->client_tracking_ops->remove(clp); } int nfsd4_client_record_check(struct nfs4_client *clp) { - if (client_tracking_ops) - return client_tracking_ops->check(clp); + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + if (nn->client_tracking_ops) + return nn->client_tracking_ops->check(clp); return -EOPNOTSUPP; } @@ -1340,8 +1348,8 @@ nfsd4_client_record_check(struct nfs4_client *clp) void nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time) { - if (client_tracking_ops) - client_tracking_ops->grace_done(nn, boot_time); + if (nn->client_tracking_ops) + nn->client_tracking_ops->grace_done(nn, boot_time); } static int -- cgit v1.2.1 From 7007c90fb9fef593b4aeaeee57e6a6754276c97c Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Fri, 7 Dec 2012 15:40:55 -0500 Subject: nfsd: avoid permission checks on EXCLUSIVE_CREATE replay With NFSv4, if we create a file then open it we explicitly avoid checking the permissions on the file during the open because the fact that we created it ensures we should be allowed to open it (the create and the open should appear to be a single operation). However if the reply to an EXCLUSIVE create gets lost and the client resends the create, the current code will perform the permission check - because it doesn't realise that it did the open already. This patch should fix this. Note that I haven't actually seen this cause a problem. 
I was just looking at the code trying to figure out a different EXCLUSIVE open related issue, and this looked wrong. (Fix confirmed with pynfs 4.0 test OPEN4--bfields) Cc: stable@kernel.org Signed-off-by: NeilBrown [bfields: use OWNER_OVERRIDE and update for 4.1] Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 8 +++++--- fs/nfsd/vfs.c | 10 ++++++++-- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 87d24e5f3ca4..1a0b1fdb5ad3 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -195,6 +195,7 @@ static __be32 do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) { struct svc_fh *resfh; + int accmode; __be32 status; resfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL); @@ -254,9 +255,10 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o /* set reply cache */ fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh, &resfh->fh_handle); - if (!open->op_created) - status = do_open_permission(rqstp, resfh, open, - NFSD_MAY_NOP); + accmode = NFSD_MAY_NOP; + if (open->op_created) + accmode |= NFSD_MAY_OWNER_OVERRIDE; + status = do_open_permission(rqstp, resfh, open, accmode); set_change_info(&open->op_cinfo, current_fh); fh_dup2(current_fh, resfh); out: diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index b584205b25b4..0ef9b6b410a2 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1471,13 +1471,19 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, case NFS3_CREATE_EXCLUSIVE: if ( dchild->d_inode->i_mtime.tv_sec == v_mtime && dchild->d_inode->i_atime.tv_sec == v_atime - && dchild->d_inode->i_size == 0 ) + && dchild->d_inode->i_size == 0 ) { + if (created) + *created = 1; break; + } case NFS4_CREATE_EXCLUSIVE4_1: if ( dchild->d_inode->i_mtime.tv_sec == v_mtime && dchild->d_inode->i_atime.tv_sec == v_atime - && dchild->d_inode->i_size == 0 ) + && dchild->d_inode->i_size == 0 ) { + if (created) + *created = 1; goto set_attr; + } /* fallthru */ case NFS3_CREATE_GUARDED: err = nfserr_exist; -- cgit v1.2.1 From f7fb86c6e639360ad9c253cec534819ef928a674 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Mon, 10 Dec 2012 12:19:04 +0300 Subject: nfsd: use "init_net" for portmapper There could be a situation, when NFSd was started in one network namespace, but stopped in another one. This will trigger kernel panic, because RPCBIND client is stored on per-net NFSd data, and will be NULL on NFSd shutdown. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. Bruce Fields --- fs/nfsd/nfssvc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index b34a67d8ec44..9beace6a868c 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include @@ -341,7 +340,7 @@ static int nfsd_get_default_max_blksize(void) int nfsd_create_serv(void) { int error; - struct net *net = current->nsproxy->net_ns; + struct net *net = &init_net; WARN_ON(!mutex_is_locked(&nfsd_mutex)); if (nfsd_serv) { -- cgit v1.2.1 From db6e182c17cb1a7069f7f8924721ce58ac05d9a3 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Mon, 10 Dec 2012 12:19:09 +0300 Subject: nfsd: pass net to nfsd_init_socks() Precursor patch. Hard-coded "init_net" will be replaced by proper one in future. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfssvc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 9beace6a868c..9fd8496d5b84 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -182,18 +182,18 @@ int nfsd_nrthreads(void) return rv; } -static int nfsd_init_socks(void) +static int nfsd_init_socks(struct net *net) { int error; if (!list_empty(&nfsd_serv->sv_permsocks)) return 0; - error = svc_create_xprt(nfsd_serv, "udp", &init_net, PF_INET, NFS_PORT, + error = svc_create_xprt(nfsd_serv, "udp", net, PF_INET, NFS_PORT, SVC_SOCK_DEFAULTS); if (error < 0) return error; - error = svc_create_xprt(nfsd_serv, "tcp", &init_net, PF_INET, NFS_PORT, + error = svc_create_xprt(nfsd_serv, "tcp", net, PF_INET, NFS_PORT, SVC_SOCK_DEFAULTS); if (error < 0) return error; @@ -218,7 +218,7 @@ static int nfsd_startup(int nrservs) ret = nfsd_racache_init(2*nrservs); if (ret) return ret; - ret = nfsd_init_socks(); + ret = nfsd_init_socks(net); if (ret) goto out_racache; ret = lockd_up(net); -- cgit v1.2.1 From db42d1a76a8dfcaba7a2dc9c591fa4e231db22b3 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Mon, 10 Dec 2012 12:19:14 +0300 Subject: nfsd: pass net to nfsd_startup() and nfsd_shutdown() Precursor patch. Hard-coded "init_net" will be replaced by proper one in future. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. Bruce Fields --- fs/nfsd/nfssvc.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 9fd8496d5b84..21cba3d7c865 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -203,10 +203,9 @@ static int nfsd_init_socks(struct net *net) static bool nfsd_up = false; -static int nfsd_startup(int nrservs) +static int nfsd_startup(int nrservs, struct net *net) { int ret; - struct net *net = &init_net; if (nfsd_up) return 0; @@ -237,16 +236,14 @@ static int nfsd_startup(int nrservs) out_net_state: nfs4_state_shutdown(); out_lockd: - lockd_down(&init_net); + lockd_down(net); out_racache: nfsd_racache_shutdown(); return ret; } -static void nfsd_shutdown(void) +static void nfsd_shutdown(struct net *net) { - struct net *net = &init_net; - /* * write_ports can create the server without actually starting * any threads--if we get shut down before any threads are @@ -264,7 +261,7 @@ static void nfsd_shutdown(void) static void nfsd_last_thread(struct svc_serv *serv, struct net *net) { - nfsd_shutdown(); + nfsd_shutdown(net); svc_rpcb_cleanup(serv, net); @@ -468,7 +465,7 @@ nfsd_svc(int nrservs) nfsd_up_before = nfsd_up; - error = nfsd_startup(nrservs); + error = nfsd_startup(nrservs, net); if (error) goto out_destroy; error = svc_set_num_threads(nfsd_serv, NULL, nrservs); @@ -481,7 +478,7 @@ nfsd_svc(int nrservs) error = nfsd_serv->sv_nrthreads - 1; out_shutdown: if (error < 0 && !nfsd_up_before) - nfsd_shutdown(); + nfsd_shutdown(net); out_destroy: nfsd_destroy(net); /* Release server */ out: -- cgit v1.2.1 From 6777436b0f072fb20a025a73e9b67a35ad8a5451 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Mon, 10 Dec 2012 12:19:20 +0300 Subject: nfsd: pass net to nfsd_create_serv() Precursor patch. Hard-coded "init_net" will be replaced by proper one in future. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfsctl.c | 4 ++-- fs/nfsd/nfsd.h | 2 +- fs/nfsd/nfssvc.c | 5 ++--- 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index e13cbddcdbd0..ae1d14313ef8 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -664,7 +664,7 @@ static ssize_t __write_ports_addfd(char *buf) if (err != 0 || fd < 0) return -EINVAL; - err = nfsd_create_serv(); + err = nfsd_create_serv(net); if (err != 0) return err; @@ -696,7 +696,7 @@ static ssize_t __write_ports_addxprt(char *buf) if (port < 1 || port > USHRT_MAX) return -EINVAL; - err = nfsd_create_serv(); + err = nfsd_create_serv(net); if (err != 0) return err; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 5eea0f5021fd..acddf71abd51 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -103,7 +103,7 @@ enum vers_op {NFSD_SET, NFSD_CLEAR, NFSD_TEST, NFSD_AVAIL }; int nfsd_vers(int vers, enum vers_op change); int nfsd_minorversion(u32 minorversion, enum vers_op change); void nfsd_reset_versions(void); -int nfsd_create_serv(void); +int nfsd_create_serv(struct net *net); extern int nfsd_max_blksize; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 21cba3d7c865..6448391cde54 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -334,10 +334,9 @@ static int nfsd_get_default_max_blksize(void) return ret; } -int nfsd_create_serv(void) +int nfsd_create_serv(struct net *net) { int error; - struct net *net = &init_net; WARN_ON(!mutex_is_locked(&nfsd_mutex)); if (nfsd_serv) { @@ -459,7 +458,7 @@ nfsd_svc(int nrservs) if (nrservs == 0 && nfsd_serv == NULL) goto out; - error = nfsd_create_serv(); + error = nfsd_create_serv(net); if (error) goto out; -- cgit v1.2.1 From d41a9417cd89a69f58a26935034b4264a2d882d6 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Mon, 10 Dec 2012 12:19:25 +0300 Subject: nfsd: pass net to nfsd_svc() Precursor patch. Hard-coded "init_net" will be replaced by proper one in future. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsctl.c | 4 +++- fs/nfsd/nfsd.h | 2 +- fs/nfsd/nfssvc.c | 3 +-- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index ae1d14313ef8..68e229cdfd63 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -396,6 +396,8 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size) { char *mesg = buf; int rv; + struct net *net = &init_net; + if (size > 0) { int newthreads; rv = get_int(&mesg, &newthreads); @@ -403,7 +405,7 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size) return rv; if (newthreads < 0) return -EINVAL; - rv = nfsd_svc(newthreads); + rv = nfsd_svc(newthreads, net); if (rv < 0) return rv; } else diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index acddf71abd51..8226c1b02558 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -65,7 +65,7 @@ extern const struct seq_operations nfs_exports_op; /* * Function prototypes. */ -int nfsd_svc(int nrservs); +int nfsd_svc(int nrservs, struct net *net); int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp); int nfsd_nrthreads(void); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 6448391cde54..f199b537cc81 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -442,11 +442,10 @@ int nfsd_set_nrthreads(int n, int *nthreads) * this is the first time nrservs is nonzero. 
*/ int -nfsd_svc(int nrservs) +nfsd_svc(int nrservs, struct net *net) { int error; bool nfsd_up_before; - struct net *net = &init_net; mutex_lock(&nfsd_mutex); dprintk("nfsd: creating service\n"); -- cgit v1.2.1 From 3938a0d5eb5effcc89c6909741403f4e6a37252d Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Mon, 10 Dec 2012 12:19:30 +0300 Subject: nfsd: pass net to nfsd_set_nrthreads() Precursor patch. The hard-coded "init_net" will be replaced by a proper one in the future. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsctl.c | 3 ++- fs/nfsd/nfsd.h | 2 +- fs/nfsd/nfssvc.c | 3 +-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 68e229cdfd63..58f0ae44779d 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -447,6 +447,7 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) int len; int npools; int *nthreads; + struct net *net = &init_net; mutex_lock(&nfsd_mutex); npools = nfsd_nrpools(); @@ -477,7 +478,7 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) if (nthreads[i] < 0) goto out_free; } - rv = nfsd_set_nrthreads(i, nthreads); + rv = nfsd_set_nrthreads(i, nthreads, net); if (rv) goto out_free; } diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 8226c1b02558..18f999665546 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -71,7 +71,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp); int nfsd_nrthreads(void); int nfsd_nrpools(void); int nfsd_get_nrthreads(int n, int *); -int nfsd_set_nrthreads(int n, int *); +int nfsd_set_nrthreads(int n, int *, struct net *); int nfsd_pool_stats_open(struct inode *, struct file *); int nfsd_pool_stats_release(struct inode *, struct file *); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index f199b537cc81..b144658c49b2 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -382,12 +382,11 @@ int nfsd_get_nrthreads(int n, int *nthreads) return 0; } -int nfsd_set_nrthreads(int n, int *nthreads) +int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) { int i = 0; int tot = 0; int err = 0; - struct net *net = &init_net; WARN_ON(!mutex_is_locked(&nfsd_mutex)); -- cgit v1.2.1 From 081603520b25f7b35ef63a363376a17c36ef74ed Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Mon, 10 Dec 2012 12:19:35 +0300 Subject: nfsd: pass net to __write_ports() and down Precursor patch. The hard-coded "init_net" will be replaced by a proper one in the future. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsctl.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 58f0ae44779d..8536100b7fc1 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -657,11 +657,10 @@ static ssize_t __write_ports_names(char *buf) * a socket of a supported family/protocol, and we use it as an * nfsd listener. */ -static ssize_t __write_ports_addfd(char *buf) +static ssize_t __write_ports_addfd(char *buf, struct net *net) { char *mesg = buf; int fd, err; - struct net *net = &init_net; err = get_int(&mesg, &fd); if (err != 0 || fd < 0) @@ -686,12 +685,11 @@ static ssize_t __write_ports_addfd(char *buf) * A transport listener is added by writing it's transport name and * a port number.
*/ -static ssize_t __write_ports_addxprt(char *buf) +static ssize_t __write_ports_addxprt(char *buf, struct net *net) { char transport[16]; struct svc_xprt *xprt; int port, err; - struct net *net = &init_net; if (sscanf(buf, "%15s %5u", transport, &port) != 2) return -EINVAL; @@ -727,16 +725,17 @@ out_err: return err; } -static ssize_t __write_ports(struct file *file, char *buf, size_t size) +static ssize_t __write_ports(struct file *file, char *buf, size_t size, + struct net *net) { if (size == 0) return __write_ports_names(buf); if (isdigit(buf[0])) - return __write_ports_addfd(buf); + return __write_ports_addfd(buf, net); if (isalpha(buf[0])) - return __write_ports_addxprt(buf); + return __write_ports_addxprt(buf, net); return -EINVAL; } @@ -787,9 +786,10 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size) static ssize_t write_ports(struct file *file, char *buf, size_t size) { ssize_t rv; + struct net *net = &init_net; mutex_lock(&nfsd_mutex); - rv = __write_ports(file, buf, size); + rv = __write_ports(file, buf, size, net); mutex_unlock(&nfsd_mutex); return rv; } -- cgit v1.2.1 From 6ff50b3dea9a242b50642a703b513986bffb8ce9 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Thu, 6 Dec 2012 14:23:09 +0300 Subject: nfsd: move per-net startup code to a separate function NFSd resources are partially per-net and partially globally used. This patch splits resources init and shutdown and moves per-net code to separate functions. For now, generic and per-net init and shutdown are called sequentially. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. Bruce Fields --- fs/nfsd/nfssvc.c | 48 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index b144658c49b2..038348bc1a09 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -203,6 +203,27 @@ static int nfsd_init_socks(struct net *net) static bool nfsd_up = false; +static int nfsd_startup_net(struct net *net) +{ + int ret; + + ret = nfsd_init_socks(net); + if (ret) + return ret; + ret = lockd_up(net); + if (ret) + return ret; + ret = nfs4_state_start_net(net); + if (ret) + goto out_lockd; + + return 0; + +out_lockd: + lockd_down(net); + return ret; +} + static int nfsd_startup(int nrservs, struct net *net) { int ret; @@ -217,31 +238,29 @@ static int nfsd_startup(int nrservs, struct net *net) ret = nfsd_racache_init(2*nrservs); if (ret) return ret; - ret = nfsd_init_socks(net); - if (ret) - goto out_racache; - ret = lockd_up(net); - if (ret) - goto out_racache; ret = nfs4_state_start(); if (ret) - goto out_lockd; - - ret = nfs4_state_start_net(net); + goto out_racache; + ret = nfsd_startup_net(net); if (ret) - goto out_net_state; + goto out_net; nfsd_up = true; return 0; + +out_net: + nfs4_state_shutdown(); -out_lockd: - lockd_down(net); out_racache: nfsd_racache_shutdown(); return ret; } +static void nfsd_shutdown_net(struct net *net) +{ + nfs4_state_shutdown_net(net); + lockd_down(net); +} + static void nfsd_shutdown(struct net *net) { /* @@ -252,9 +271,8 @@ static void nfsd_shutdown(struct net *net) */ if (!nfsd_up) return; - nfs4_state_shutdown_net(net); + nfsd_shutdown_net(net); nfs4_state_shutdown(); - lockd_down(net); nfsd_racache_shutdown(); nfsd_up = false; } -- cgit v1.2.1 From 2c2fe2909e124c32a34dbbb3ac129112524fc540 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Thu, 6 Dec 2012 14:23:14 +0300 Subject: nfsd: per-net NFSd up flag introduced This patch introduces
introduces per-net "nfsd_net_up" boolean flag, which has the same purpose as general "nfsd_up" flag - skip init or shutdown of per-net resources in case of they are inited on shutted down respectively. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. Bruce Fields --- fs/nfsd/netns.h | 2 ++ fs/nfsd/nfssvc.c | 12 ++++++++++++ 2 files changed, 14 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index fac4123c918c..543ac486fd5d 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -92,6 +92,8 @@ struct nfsd_net { time_t nfsd4_lease; time_t nfsd4_grace; + + bool nfsd_net_up; }; /* Simple check to find out if a given net was properly initialized */ diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 038348bc1a09..6e17efdd8afe 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -21,6 +21,7 @@ #include "nfsd.h" #include "cache.h" #include "vfs.h" +#include "netns.h" #define NFSDDBG_FACILITY NFSDDBG_SVC @@ -205,8 +206,12 @@ static bool nfsd_up = false; static int nfsd_startup_net(struct net *net) { + struct nfsd_net *nn = net_generic(net, nfsd_net_id); int ret; + if (nn->nfsd_net_up) + return 0; + ret = nfsd_init_socks(net); if (ret) return ret; @@ -217,6 +222,7 @@ static int nfsd_startup_net(struct net *net) if (ret) goto out_lockd; + nn->nfsd_net_up = true; return 0; out_lockd: @@ -257,8 +263,14 @@ out_racache: static void nfsd_shutdown_net(struct net *net) { + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + if (!nn->nfsd_net_up) + return; + nfs4_state_shutdown_net(net); lockd_down(net); + nn->nfsd_net_up = false; } static void nfsd_shutdown(struct net *net) -- cgit v1.2.1 From b9c0ef8571c6ae33465dcf41d496ce2ad783c49d Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Thu, 6 Dec 2012 14:23:19 +0300 Subject: nfsd: make NFSd service boot time per-net This is simple: an NFSd service can be started at different times in different network environments. So, its "boot time" has to be assigned per net. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. 
Bruce Fields --- fs/nfsd/netns.h | 5 +++++ fs/nfsd/nfs3xdr.c | 14 ++++++++++---- fs/nfsd/nfs4proc.c | 11 ++++++----- fs/nfsd/nfsd.h | 5 ----- fs/nfsd/nfssvc.c | 4 ++-- 5 files changed, 23 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 543ac486fd5d..3b283eaab10d 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -94,6 +94,11 @@ struct nfsd_net { time_t nfsd4_grace; bool nfsd_net_up; + + /* + * Time of server startup + */ + struct timeval nfssvc_boot; }; /* Simple check to find out if a given net was properly initialized */ diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 43f46cd9edea..2b8618de6c27 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -7,8 +7,10 @@ */ #include +#include #include "xdr3.h" #include "auth.h" +#include "netns.h" #define NFSDDBG_FACILITY NFSDDBG_XDR @@ -720,12 +722,14 @@ int nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_writeres *resp) { + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + p = encode_wcc_data(rqstp, p, &resp->fh); if (resp->status == 0) { *p++ = htonl(resp->count); *p++ = htonl(resp->committed); - *p++ = htonl(nfssvc_boot.tv_sec); - *p++ = htonl(nfssvc_boot.tv_usec); + *p++ = htonl(nn->nfssvc_boot.tv_sec); + *p++ = htonl(nn->nfssvc_boot.tv_usec); } return xdr_ressize_check(rqstp, p); } @@ -1082,11 +1086,13 @@ int nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_commitres *resp) { + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + p = encode_wcc_data(rqstp, p, &resp->fh); /* Write verifier */ if (resp->status == 0) { - *p++ = htonl(nfssvc_boot.tv_sec); - *p++ = htonl(nfssvc_boot.tv_usec); + *p++ = htonl(nn->nfssvc_boot.tv_sec); + *p++ = htonl(nn->nfssvc_boot.tv_usec); } return xdr_ressize_check(rqstp, p); } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 1a0b1fdb5ad3..bd67f4d6dfc6 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -497,12 +497,13 @@ nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &access->ac_supported); } -static void gen_boot_verifier(nfs4_verifier *verifier) +static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net) { __be32 verf[2]; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); - verf[0] = (__be32)nfssvc_boot.tv_sec; - verf[1] = (__be32)nfssvc_boot.tv_usec; + verf[0] = (__be32)nn->nfssvc_boot.tv_sec; + verf[1] = (__be32)nn->nfssvc_boot.tv_usec; memcpy(verifier->data, verf, sizeof(verifier->data)); } @@ -510,7 +511,7 @@ static __be32 nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_commit *commit) { - gen_boot_verifier(&commit->co_verf); + gen_boot_verifier(&commit->co_verf, SVC_NET(rqstp)); return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, commit->co_count); } @@ -930,7 +931,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, cnt = write->wr_buflen; write->wr_how_written = write->wr_stable_how; - gen_boot_verifier(&write->wr_verifier); + gen_boot_verifier(&write->wr_verifier, SVC_NET(rqstp)); nvecs = fill_in_write_vector(rqstp->rq_vec, write); WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec)); diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 18f999665546..71ba60d36234 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -269,11 +269,6 @@ void nfsd_lockd_shutdown(void); /* Check for dir entries '.' and '..' */ #define isdotent(n, l) (l < 3 && n[0] == '.' 
&& (l == 1 || n[1] == '.')) -/* - * Time of server startup - */ -extern struct timeval nfssvc_boot; - #ifdef CONFIG_NFSD_V4 /* before processing a COMPOUND operation, we have to check that there diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 6e17efdd8afe..40992cd5bff9 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -27,7 +27,6 @@ extern struct svc_program nfsd_program; static int nfsd(void *vrqstp); -struct timeval nfssvc_boot; /* * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members @@ -367,6 +366,7 @@ static int nfsd_get_default_max_blksize(void) int nfsd_create_serv(struct net *net) { int error; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); WARN_ON(!mutex_is_locked(&nfsd_mutex)); if (nfsd_serv) { @@ -388,7 +388,7 @@ int nfsd_create_serv(struct net *net) } set_max_drc(); - do_gettimeofday(&nfssvc_boot); /* record boot time */ + do_gettimeofday(&nn->nfssvc_boot); /* record boot time */ return 0; } -- cgit v1.2.1 From 9dd9845f084cda07ce00cca32a5ba8fbcbbfbcaf Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Thu, 6 Dec 2012 14:23:24 +0300 Subject: nfsd: make NFSd service structure allocated per net This patch makes main step in NFSd containerisation. There could be different approaches to how to make NFSd able to handle incoming RPC request from different network namespaces. The two main options are: 1) Share NFSd kthreads betwween all network namespaces. 2) Create separated pool of threads for each namespace. While first approach looks more flexible, second one is simpler and non-racy. This patch implements the second option. To make it possible to allocate separate pools of threads, we have to make it possible to allocate separate NFSd service structures per net. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. 
Bruce Fields --- fs/nfsd/netns.h | 2 ++ fs/nfsd/nfs4state.c | 14 ++++++--- fs/nfsd/nfsctl.c | 63 ++++++++++++++++++++++--------------- fs/nfsd/nfsd.h | 18 +++-------- fs/nfsd/nfssvc.c | 91 +++++++++++++++++++++++++++++++++-------------------- 5 files changed, 110 insertions(+), 78 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 3b283eaab10d..1051bebff1b0 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -99,6 +99,8 @@ struct nfsd_net { * Time of server startup */ struct timeval nfssvc_boot; + + struct svc_serv *nfsd_serv; }; /* Simple check to find out if a given net was properly initialized */ diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 16e954c1c911..3d27f08e2297 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -743,9 +743,12 @@ out_free: return NULL; } -static void init_forechannel_attrs(struct nfsd4_channel_attrs *new, struct nfsd4_channel_attrs *req, int numslots, int slotsize) +static void init_forechannel_attrs(struct nfsd4_channel_attrs *new, + struct nfsd4_channel_attrs *req, + int numslots, int slotsize, + struct nfsd_net *nn) { - u32 maxrpc = nfsd_serv->sv_max_mesg; + u32 maxrpc = nn->nfsd_serv->sv_max_mesg; new->maxreqs = numslots; new->maxresp_cached = min_t(u32, req->maxresp_cached, @@ -883,7 +886,8 @@ void nfsd4_put_session(struct nfsd4_session *ses) spin_unlock(&nn->client_lock); } -static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan) +static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan, + struct nfsd_net *nn) { struct nfsd4_session *new; int numslots, slotsize; @@ -904,7 +908,7 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan) nfsd4_put_drc_mem(slotsize, fchan->maxreqs); return NULL; } - init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize); + init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize, nn); return new; } @@ -1776,7 +1780,7 @@ nfsd4_create_session(struct svc_rqst *rqstp, return nfserr_inval; if (check_forechannel_attrs(cr_ses->fore_channel)) return nfserr_toosmall; - new = alloc_session(&cr_ses->fore_channel); + new = alloc_session(&cr_ses->fore_channel, nn); if (!new) return nfserr_jukebox; status = nfserr_jukebox; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 8536100b7fc1..74934284d9a7 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -409,7 +409,7 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size) if (rv < 0) return rv; } else - rv = nfsd_nrthreads(); + rv = nfsd_nrthreads(net); return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n", rv); } @@ -450,7 +450,7 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) struct net *net = &init_net; mutex_lock(&nfsd_mutex); - npools = nfsd_nrpools(); + npools = nfsd_nrpools(net); if (npools == 0) { /* * NFS is shut down. 
The admin can start it by @@ -483,7 +483,7 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) goto out_free; } - rv = nfsd_get_nrthreads(npools, nthreads); + rv = nfsd_get_nrthreads(npools, nthreads, net); if (rv) goto out_free; @@ -510,11 +510,13 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) unsigned minor; ssize_t tlen = 0; char *sep; + struct net *net = &init_net; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (size>0) { - if (nfsd_serv) + if (nn->nfsd_serv) /* Cannot change versions without updating - * nfsd_serv->sv_xdrsize, and reallocing + * nn->nfsd_serv->sv_xdrsize, and reallocing * rq_argp and rq_resp */ return -EBUSY; @@ -645,11 +647,13 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size) * Zero-length write. Return a list of NFSD's current listener * transports. */ -static ssize_t __write_ports_names(char *buf) +static ssize_t __write_ports_names(char *buf, struct net *net) { - if (nfsd_serv == NULL) + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + if (nn->nfsd_serv == NULL) return 0; - return svc_xprt_names(nfsd_serv, buf, SIMPLE_TRANSACTION_LIMIT); + return svc_xprt_names(nn->nfsd_serv, buf, SIMPLE_TRANSACTION_LIMIT); } /* @@ -661,6 +665,7 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net) { char *mesg = buf; int fd, err; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); err = get_int(&mesg, &fd); if (err != 0 || fd < 0) @@ -670,14 +675,14 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net) if (err != 0) return err; - err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT); + err = svc_addsock(nn->nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT); if (err < 0) { nfsd_destroy(net); return err; } /* Decrease the count, but don't shut down the service */ - nfsd_serv->sv_nrthreads--; + nn->nfsd_serv->sv_nrthreads--; return err; } @@ -690,6 +695,7 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net) char transport[16]; struct svc_xprt *xprt; int port, err; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (sscanf(buf, "%15s %5u", transport, &port) != 2) return -EINVAL; @@ -701,21 +707,21 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net) if (err != 0) return err; - err = svc_create_xprt(nfsd_serv, transport, net, + err = svc_create_xprt(nn->nfsd_serv, transport, net, PF_INET, port, SVC_SOCK_ANONYMOUS); if (err < 0) goto out_err; - err = svc_create_xprt(nfsd_serv, transport, net, + err = svc_create_xprt(nn->nfsd_serv, transport, net, PF_INET6, port, SVC_SOCK_ANONYMOUS); if (err < 0 && err != -EAFNOSUPPORT) goto out_close; /* Decrease the count, but don't shut down the service */ - nfsd_serv->sv_nrthreads--; + nn->nfsd_serv->sv_nrthreads--; return 0; out_close: - xprt = svc_find_xprt(nfsd_serv, transport, net, PF_INET, port); + xprt = svc_find_xprt(nn->nfsd_serv, transport, net, PF_INET, port); if (xprt != NULL) { svc_close_xprt(xprt); svc_xprt_put(xprt); @@ -729,7 +735,7 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size, struct net *net) { if (size == 0) - return __write_ports_names(buf); + return __write_ports_names(buf, net); if (isdigit(buf[0])) return __write_ports_addfd(buf, net); @@ -821,6 +827,9 @@ int nfsd_max_blksize; static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) { char *mesg = buf; + struct net *net = &init_net; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + if (size > 0) { int bsize; int rv = get_int(&mesg, &bsize); @@ -835,7 
+844,7 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) bsize = NFSSVC_MAXBLKSIZE; bsize &= ~(1024-1); mutex_lock(&nfsd_mutex); - if (nfsd_serv) { + if (nn->nfsd_serv) { mutex_unlock(&nfsd_mutex); return -EBUSY; } @@ -848,13 +857,14 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) } #ifdef CONFIG_NFSD_V4 -static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time) +static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, + time_t *time, struct nfsd_net *nn) { char *mesg = buf; int rv, i; if (size > 0) { - if (nfsd_serv) + if (nn->nfsd_serv) return -EBUSY; rv = get_int(&mesg, &i); if (rv) @@ -879,12 +889,13 @@ static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, tim return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", *time); } -static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time) +static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, + time_t *time, struct nfsd_net *nn) { ssize_t rv; mutex_lock(&nfsd_mutex); - rv = __nfsd4_write_time(file, buf, size, time); + rv = __nfsd4_write_time(file, buf, size, time, nn); mutex_unlock(&nfsd_mutex); return rv; } @@ -913,7 +924,7 @@ static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, time_ static ssize_t write_leasetime(struct file *file, char *buf, size_t size) { struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); - return nfsd4_write_time(file, buf, size, &nn->nfsd4_lease); + return nfsd4_write_time(file, buf, size, &nn->nfsd4_lease, nn); } /** @@ -929,17 +940,18 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size) static ssize_t write_gracetime(struct file *file, char *buf, size_t size) { struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); - return nfsd4_write_time(file, buf, size, &nn->nfsd4_grace); + return nfsd4_write_time(file, buf, size, &nn->nfsd4_grace, nn); } -static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size) +static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size, + struct nfsd_net *nn) { char *mesg = buf; char *recdir; int len, status; if (size > 0) { - if (nfsd_serv) + if (nn->nfsd_serv) return -EBUSY; if (size > PATH_MAX || buf[size-1] != '\n') return -EINVAL; @@ -983,9 +995,10 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size) static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) { ssize_t rv; + struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); mutex_lock(&nfsd_mutex); - rv = __write_recoverydir(file, buf, size); + rv = __write_recoverydir(file, buf, size, nn); mutex_unlock(&nfsd_mutex); return rv; } diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 71ba60d36234..de23db255c69 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -55,7 +55,6 @@ extern struct svc_version nfsd_version2, nfsd_version3, nfsd_version4; extern u32 nfsd_supported_minorversion; extern struct mutex nfsd_mutex; -extern struct svc_serv *nfsd_serv; extern spinlock_t nfsd_drc_lock; extern unsigned int nfsd_drc_max_mem; extern unsigned int nfsd_drc_mem_used; @@ -68,23 +67,14 @@ extern const struct seq_operations nfs_exports_op; int nfsd_svc(int nrservs, struct net *net); int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp); -int nfsd_nrthreads(void); -int nfsd_nrpools(void); -int nfsd_get_nrthreads(int n, int *); +int nfsd_nrthreads(struct net *); +int nfsd_nrpools(struct net *); +int nfsd_get_nrthreads(int 
n, int *, struct net *); int nfsd_set_nrthreads(int n, int *, struct net *); int nfsd_pool_stats_open(struct inode *, struct file *); int nfsd_pool_stats_release(struct inode *, struct file *); -static inline void nfsd_destroy(struct net *net) -{ - int destroy = (nfsd_serv->sv_nrthreads == 1); - - if (destroy) - svc_shutdown_net(nfsd_serv, net); - svc_destroy(nfsd_serv); - if (destroy) - nfsd_serv = NULL; -} +void nfsd_destroy(struct net *net); #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) #ifdef CONFIG_NFSD_V2_ACL diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 40992cd5bff9..0e8622a4341c 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -29,11 +29,11 @@ extern struct svc_program nfsd_program; static int nfsd(void *vrqstp); /* - * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members + * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and the members * of the svc_serv struct. In particular, ->sv_nrthreads but also to some * extent ->sv_temp_socks and ->sv_permsocks. It also protects nfsdstats.th_cnt * - * If (out side the lock) nfsd_serv is non-NULL, then it must point to a + * If (out side the lock) nn->nfsd_serv is non-NULL, then it must point to a * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0. That number * of nfsd threads must exist and each must listed in ->sp_all_threads in each * entry of ->sv_pools[]. @@ -51,7 +51,6 @@ static int nfsd(void *vrqstp); * nfsd_versions */ DEFINE_MUTEX(nfsd_mutex); -struct svc_serv *nfsd_serv; /* * nfsd_drc_lock protects nfsd_drc_max_pages and nfsd_drc_pages_used. @@ -172,12 +171,14 @@ int nfsd_minorversion(u32 minorversion, enum vers_op change) */ #define NFSD_MAXSERVS 8192 -int nfsd_nrthreads(void) +int nfsd_nrthreads(struct net *net) { int rv = 0; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + mutex_lock(&nfsd_mutex); - if (nfsd_serv) - rv = nfsd_serv->sv_nrthreads; + if (nn->nfsd_serv) + rv = nn->nfsd_serv->sv_nrthreads; mutex_unlock(&nfsd_mutex); return rv; } @@ -185,15 +186,17 @@ int nfsd_nrthreads(void) static int nfsd_init_socks(struct net *net) { int error; - if (!list_empty(&nfsd_serv->sv_permsocks)) + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + if (!list_empty(&nn->nfsd_serv->sv_permsocks)) return 0; - error = svc_create_xprt(nfsd_serv, "udp", net, PF_INET, NFS_PORT, + error = svc_create_xprt(nn->nfsd_serv, "udp", net, PF_INET, NFS_PORT, SVC_SOCK_DEFAULTS); if (error < 0) return error; - error = svc_create_xprt(nfsd_serv, "tcp", net, PF_INET, NFS_PORT, + error = svc_create_xprt(nn->nfsd_serv, "tcp", net, PF_INET, NFS_PORT, SVC_SOCK_DEFAULTS); if (error < 0) return error; @@ -369,21 +372,21 @@ int nfsd_create_serv(struct net *net) struct nfsd_net *nn = net_generic(net, nfsd_net_id); WARN_ON(!mutex_is_locked(&nfsd_mutex)); - if (nfsd_serv) { - svc_get(nfsd_serv); + if (nn->nfsd_serv) { + svc_get(nn->nfsd_serv); return 0; } if (nfsd_max_blksize == 0) nfsd_max_blksize = nfsd_get_default_max_blksize(); nfsd_reset_versions(); - nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, + nn->nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, nfsd_last_thread, nfsd, THIS_MODULE); - if (nfsd_serv == NULL) + if (nn->nfsd_serv == NULL) return -ENOMEM; - error = svc_bind(nfsd_serv, net); + error = svc_bind(nn->nfsd_serv, net); if (error < 0) { - svc_destroy(nfsd_serv); + svc_destroy(nn->nfsd_serv); return error; } @@ -392,39 +395,55 @@ int nfsd_create_serv(struct net *net) return 0; } -int nfsd_nrpools(void) +int nfsd_nrpools(struct 
net *net) { - if (nfsd_serv == NULL) + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + if (nn->nfsd_serv == NULL) return 0; else - return nfsd_serv->sv_nrpools; + return nn->nfsd_serv->sv_nrpools; } -int nfsd_get_nrthreads(int n, int *nthreads) +int nfsd_get_nrthreads(int n, int *nthreads, struct net *net) { int i = 0; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); - if (nfsd_serv != NULL) { - for (i = 0; i < nfsd_serv->sv_nrpools && i < n; i++) - nthreads[i] = nfsd_serv->sv_pools[i].sp_nrthreads; + if (nn->nfsd_serv != NULL) { + for (i = 0; i < nn->nfsd_serv->sv_nrpools && i < n; i++) + nthreads[i] = nn->nfsd_serv->sv_pools[i].sp_nrthreads; } return 0; } +void nfsd_destroy(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + int destroy = (nn->nfsd_serv->sv_nrthreads == 1); + + if (destroy) + svc_shutdown_net(nn->nfsd_serv, net); + svc_destroy(nn->nfsd_serv); + if (destroy) + nn->nfsd_serv = NULL; +} + int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) { int i = 0; int tot = 0; int err = 0; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); WARN_ON(!mutex_is_locked(&nfsd_mutex)); - if (nfsd_serv == NULL || n <= 0) + if (nn->nfsd_serv == NULL || n <= 0) return 0; - if (n > nfsd_serv->sv_nrpools) - n = nfsd_serv->sv_nrpools; + if (n > nn->nfsd_serv->sv_nrpools) + n = nn->nfsd_serv->sv_nrpools; /* enforce a global maximum number of threads */ tot = 0; @@ -454,9 +473,9 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) nthreads[0] = 1; /* apply the new numbers */ - svc_get(nfsd_serv); + svc_get(nn->nfsd_serv); for (i = 0; i < n; i++) { - err = svc_set_num_threads(nfsd_serv, &nfsd_serv->sv_pools[i], + err = svc_set_num_threads(nn->nfsd_serv, &nn->nfsd_serv->sv_pools[i], nthreads[i]); if (err) break; @@ -475,6 +494,7 @@ nfsd_svc(int nrservs, struct net *net) { int error; bool nfsd_up_before; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); mutex_lock(&nfsd_mutex); dprintk("nfsd: creating service\n"); @@ -483,7 +503,7 @@ nfsd_svc(int nrservs, struct net *net) if (nrservs > NFSD_MAXSERVS) nrservs = NFSD_MAXSERVS; error = 0; - if (nrservs == 0 && nfsd_serv == NULL) + if (nrservs == 0 && nn->nfsd_serv == NULL) goto out; error = nfsd_create_serv(net); @@ -495,14 +515,14 @@ nfsd_svc(int nrservs, struct net *net) error = nfsd_startup(nrservs, net); if (error) goto out_destroy; - error = svc_set_num_threads(nfsd_serv, NULL, nrservs); + error = svc_set_num_threads(nn->nfsd_serv, NULL, nrservs); if (error) goto out_shutdown; - /* We are holding a reference to nfsd_serv which + /* We are holding a reference to nn->nfsd_serv which * we don't want to count in the return value, * so subtract 1 */ - error = nfsd_serv->sv_nrthreads - 1; + error = nn->nfsd_serv->sv_nrthreads - 1; out_shutdown: if (error < 0 && !nfsd_up_before) nfsd_shutdown(net); @@ -681,14 +701,17 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) int nfsd_pool_stats_open(struct inode *inode, struct file *file) { int ret; + struct net *net = &init_net; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + mutex_lock(&nfsd_mutex); - if (nfsd_serv == NULL) { + if (nn->nfsd_serv == NULL) { mutex_unlock(&nfsd_mutex); return -ENODEV; } /* bump up the psudo refcount while traversing */ - svc_get(nfsd_serv); - ret = svc_pool_stats_open(nfsd_serv, file); + svc_get(nn->nfsd_serv); + ret = svc_pool_stats_open(nn->nfsd_serv, file); mutex_unlock(&nfsd_mutex); return ret; } -- cgit v1.2.1 From bda9cac1db8ab044e9edbfe5730283016b67d451 Mon Sep 17 00:00:00 2001 From: Stanislav 
Kinsbursky Date: Thu, 6 Dec 2012 14:23:29 +0300 Subject: nfsd: introduce helpers for generic resources init and shutdown NFSd has per-net resources as well as resources used globally. Let's move generic resource init and shutdown to separate functions, since they are going to be allocated on the first NFSd service start and destroyed after the last NFSd service shutdown. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. Bruce Fields --- fs/nfsd/nfssvc.c | 50 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 0e8622a4341c..f9d147f6dfd4 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -206,6 +206,37 @@ static int nfsd_init_socks(struct net *net) static bool nfsd_up = false; +static int nfsd_startup_generic(int nrservs) +{ + int ret; + + if (nfsd_up) + return 0; + + /* + * Readahead param cache - will no-op if it already exists. + * (Note therefore results will be suboptimal if number of + * threads is modified after nfsd start.) + */ + ret = nfsd_racache_init(2*nrservs); + if (ret) + return ret; + ret = nfs4_state_start(); + if (ret) + goto out_racache; + return 0; + +out_racache: + nfsd_racache_shutdown(); + return ret; +} + +static void nfsd_shutdown_generic(void) +{ + nfs4_state_shutdown(); + nfsd_racache_shutdown(); +} + static int nfsd_startup_net(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -236,19 +267,9 @@ static int nfsd_startup(int nrservs, struct net *net) { int ret; - if (nfsd_up) - return 0; - /* - * Readahead param cache - will no-op if it already exists. - * (Note therefore results will be suboptimal if number of - * threads is modified after nfsd start.) - */ - ret = nfsd_racache_init(2*nrservs); + ret = nfsd_startup_generic(nrservs); if (ret) return ret; - ret = nfs4_state_start(); - if (ret) - goto out_racache; ret = nfsd_startup_net(net); if (ret) goto out_net; @@ -257,9 +278,7 @@ static int nfsd_startup(int nrservs, struct net *net) return 0; out_net: - nfs4_state_shutdown(); -out_racache: - nfsd_racache_shutdown(); + nfsd_shutdown_generic(); return ret; } @@ -286,8 +305,7 @@ static void nfsd_shutdown(struct net *net) if (!nfsd_up) return; nfsd_shutdown_net(net); - nfs4_state_shutdown(); - nfsd_racache_shutdown(); + nfsd_shutdown_generic(); nfsd_up = false; } -- cgit v1.2.1 From 903d9bf0edebc9d9f06df125ab2bd57b4aa4e78e Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Thu, 6 Dec 2012 14:23:34 +0300 Subject: nfsd: simplify NFSv4 state init and shutdown This patch moves nfsd_startup_generic() and nfsd_shutdown_generic() calls to nfsd_startup_net() and nfsd_shutdown_net() respectively, which allows us to call nfsd_startup_net() instead of nfsd_startup() and makes the code look clearer. It also modifies nfsd_svc() and nfsd_shutdown() to check nn->nfsd_net_up instead of the global nfsd_up. The latter is now used only for generic resources shutdown and is currently useless. It will be replaced by an NFSd users counter later in this series. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. Bruce Fields
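The startup helpers split out above and refined below all lean on the kernel's usual goto-unwind idiom: each successful init step is undone in reverse order when a later step fails. A compact sketch of the idiom; the step_*/undo_* names are hypothetical stand-ins for nfsd_init_socks(), lockd_up() and nfs4_state_start_net(), not functions from these patches:

/* Hypothetical steps; each undo_*() releases what the matching
 * step_*() acquired. */
static int example_startup(struct net *net)
{
	int ret;

	ret = step_sockets(net);
	if (ret)
		return ret;
	ret = step_lockd(net);
	if (ret)
		goto out_sockets;
	ret = step_state(net);
	if (ret)
		goto out_lockd;
	return 0;

out_lockd:
	undo_lockd(net);
out_sockets:
	undo_sockets(net);
	return ret;
}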
--- fs/nfsd/nfssvc.c | 44 +++++++++++++++----------------------------- 1 file changed, 15 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index f9d147f6dfd4..0c87b4e7d1b5 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -235,9 +235,10 @@ static void nfsd_shutdown_generic(void) { nfs4_state_shutdown(); nfsd_racache_shutdown(); + nfsd_up = false; } -static int nfsd_startup_net(struct net *net) +static int nfsd_startup_net(int nrservs, struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); int ret; @@ -245,39 +246,26 @@ static int nfsd_startup_net(struct net *net) if (nn->nfsd_net_up) return 0; - ret = nfsd_init_socks(net); + ret = nfsd_startup_generic(nrservs); if (ret) return ret; + ret = nfsd_init_socks(net); + if (ret) + goto out_socks; ret = lockd_up(net); if (ret) - return ret; + goto out_socks; ret = nfs4_state_start_net(net); if (ret) goto out_lockd; nn->nfsd_net_up = true; + nfsd_up = true; return 0; out_lockd: lockd_down(net); - return ret; -} - -static int nfsd_startup(int nrservs, struct net *net) -{ - int ret; - - ret = nfsd_startup_generic(nrservs); - if (ret) - return ret; - ret = nfsd_startup_net(net); - if (ret) - goto out_net; - - nfsd_up = true; - return 0; - -out_net: +out_socks: nfsd_shutdown_generic(); return ret; } @@ -286,27 +274,25 @@ static void nfsd_shutdown_net(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); - if (!nn->nfsd_net_up) - return; - nfs4_state_shutdown_net(net); lockd_down(net); nn->nfsd_net_up = false; + nfsd_shutdown_generic(); } static void nfsd_shutdown(struct net *net) { + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + /* * write_ports can create the server without actually starting * any threads--if we get shut down before any threads are * started, then nfsd_last_thread will be run before any of this * other initialization has been done. */ - if (!nfsd_up) + if (!nn->nfsd_net_up) return; nfsd_shutdown_net(net); - nfsd_shutdown_generic(); - nfsd_up = false; } static void nfsd_last_thread(struct svc_serv *serv, struct net *net) @@ -528,9 +514,9 @@ nfsd_svc(int nrservs, struct net *net) if (error) goto out; - nfsd_up_before = nfsd_up; + nfsd_up_before = nn->nfsd_net_up; - error = nfsd_startup(nrservs, net); + error = nfsd_startup_net(nrservs, net); if (error) goto out_destroy; error = svc_set_num_threads(nn->nfsd_serv, NULL, nrservs); -- cgit v1.2.1 From 4539f14981ce02d48b212786a41c8bcfb62851b4 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Thu, 6 Dec 2012 14:23:39 +0300 Subject: nfsd: replace boolean nfsd_up flag by users counter Since we have generic NFSd resources, we have to introduce a way to allocate and destroy those resources on the first per-net NFSd start and on the last per-net NFSd stop, respectively. This patch replaces the global boolean nfsd_up flag (which is unused now) with a users counter and uses it to determine whether we need to allocate generic resources or destroy them. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. Bruce Fields
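The users counter introduced below is a plain reference count: the first caller sets up the shared resources and the last caller tears them down. Both paths run under nfsd_mutex in the code being patched, so an ordinary int suffices and no atomics are needed. A sketch under that assumption; the allocate/free helpers are hypothetical:

static int example_users;	/* guarded by nfsd_mutex in the real code */

static int example_get(void)
{
	int ret;

	if (example_users++)
		return 0;			/* already set up */
	ret = allocate_generic_resources();	/* hypothetical helper */
	if (ret)
		example_users--;		/* undo the count on failure */
	return ret;
}

static void example_put(void)
{
	if (--example_users)
		return;				/* still in use elsewhere */
	free_generic_resources();		/* hypothetical helper */
}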
--- fs/nfsd/nfssvc.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 0c87b4e7d1b5..5bb4a33211c7 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -204,13 +204,13 @@ static int nfsd_init_socks(struct net *net) return 0; } -static bool nfsd_up = false; +static int nfsd_users = 0; static int nfsd_startup_generic(int nrservs) { int ret; - if (nfsd_up) + if (nfsd_users++) return 0; /* @@ -233,9 +233,11 @@ out_racache: static void nfsd_shutdown_generic(void) { + if (--nfsd_users) + return; + nfs4_state_shutdown(); nfsd_racache_shutdown(); - nfsd_up = false; } static int nfsd_startup_net(int nrservs, struct net *net) @@ -260,7 +262,6 @@ static int nfsd_startup_net(int nrservs, struct net *net) goto out_lockd; nn->nfsd_net_up = true; - nfsd_up = true; return 0; out_lockd: -- cgit v1.2.1 From 541e864f00d0062c98c1e743265b0a60cada3755 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Thu, 6 Dec 2012 14:23:44 +0300 Subject: nfsd: simplify service shutdown The function nfsd_shutdown() is called from two places: nfsd_last_thread() (when the last kernel thread is exiting) and nfsd_svc() (in case of a kthread starting error). When calling from nfsd_svc(), we can be sure that per-net resources are allocated, so we don't need to check the per-net nfsd_net_up boolean flag. This allows us to remove the nfsd_shutdown() function altogether and move the check for the per-net nfsd_net_up boolean flag to nfsd_last_thread(). Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. Bruce Fields --- fs/nfsd/nfssvc.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 5bb4a33211c7..2cfd9c69503e 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -281,7 +281,7 @@ static void nfsd_shutdown_net(struct net *net) nfsd_shutdown_generic(); } -static void nfsd_shutdown(struct net *net) +static void nfsd_last_thread(struct svc_serv *serv, struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -294,11 +294,6 @@ static void nfsd_shutdown(struct net *net) if (!nn->nfsd_net_up) return; nfsd_shutdown_net(net); -} - -static void nfsd_last_thread(struct svc_serv *serv, struct net *net) -{ - nfsd_shutdown(net); svc_rpcb_cleanup(serv, net); @@ -530,7 +525,7 @@ nfsd_svc(int nrservs, struct net *net) error = nn->nfsd_serv->sv_nrthreads - 1; out_shutdown: if (error < 0 && !nfsd_up_before) - nfsd_shutdown(net); + nfsd_shutdown_net(net); out_destroy: nfsd_destroy(net); /* Release server */ out: -- cgit v1.2.1 From 88c47666171989ed4c5b1a5687df09511e8c5e35 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Thu, 6 Dec 2012 18:34:42 +0300 Subject: nfsd: pass proper net to nfsd_destroy() from NFSd kthreads Since the NFSd service is per-net now, we have to pass the proper network context to nfsd_shutdown() from NFSd kthreads. The simplest way I found is to get the proper net from one of the transports with permanent sockets. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. Bruce Fields
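The patch below recovers the namespace inside an NFSd kthread from the server's permanent sockets: every permanent socket of a per-net svc_serv belongs to that server's namespace, so the first entry of sv_permsocks is a safe source. The same lookup, written with list_first_entry() instead of the open-coded list_entry() form used in the diff:

#include <linux/list.h>
#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/svc_xprt.h>

/* Illustrative only: derive the owning namespace of an nfsd thread
 * from the first permanent socket of its server. */
static struct net *example_net_of(struct svc_rqst *rqstp)
{
	struct svc_xprt *xprt;

	xprt = list_first_entry(&rqstp->rq_server->sv_permsocks,
				struct svc_xprt, xpt_list);
	return xprt->xpt_net;
}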
--- fs/nfsd/nfssvc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 2cfd9c69503e..cee62ab9d4a3 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -541,6 +541,8 @@ static int nfsd(void *vrqstp) { struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; + struct svc_xprt *perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list); + struct net *net = perm_sock->xpt_net; int err; /* Lock module and set up kernel thread */ @@ -605,7 +607,7 @@ out: /* Release the thread */ svc_exit_thread(rqstp); - nfsd_destroy(&init_net); + nfsd_destroy(net); /* Release module */ mutex_unlock(&nfsd_mutex); -- cgit v1.2.1 From 939da1084458246d2e29dd921c2012c177000e96 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 16:30:43 -0500 Subject: ext4: Remove CONFIG_EXT4_FS_XATTR Ted has sent out an RFC about removing this feature. Eric and Jan confirmed that both RedHat and SUSE enable this feature in all their products. David also said that "As far as I know, it's enabled in all Android kernels that use ext4." So it seems OK for us. And what's more, as inline data's implementation depends on xattr, and, to be frank, I haven't run any tests with inline data enabled while xattr is disabled, I think we should add inline data and remove this config option in the same release. [ The savings if you disable CONFIG_EXT4_FS_XATTR are only 27k, which isn't much in the grand scheme of things. Since no one seems to be testing this configuration except for some automated compile farms, on balance we are better off removing this config option, so that it is effectively always enabled. -- tytso ] Cc: David Brown Cc: Eric Sandeen Reviewed-by: Jan Kara Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/Kconfig | 4 +- fs/ext4/Kconfig | 15 ---- fs/ext4/Makefile | 4 +- fs/ext4/ext4.h | 2 - fs/ext4/file.c | 2 - fs/ext4/namei.c | 4 - fs/ext4/super.c | 9 --- fs/ext4/symlink.c | 4 - fs/ext4/xattr.h | 235 ------------------------------------------------------ 9 files changed, 4 insertions(+), 275 deletions(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index f95ae3a027f3..eaff24a19502 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -28,8 +28,8 @@ config FS_MBCACHE tristate default y if EXT2_FS=y && EXT2_FS_XATTR default y if EXT3_FS=y && EXT3_FS_XATTR - default y if EXT4_FS=y && EXT4_FS_XATTR - default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR + default y if EXT4_FS=y + default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS source "fs/reiserfs/Kconfig" source "fs/jfs/Kconfig" diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index c22f17021b6e..0a475c881852 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -39,22 +39,8 @@ config EXT4_USE_FOR_EXT23 compiled kernel size by using one file system driver for ext2, ext3, and ext4 file systems. -config EXT4_FS_XATTR - bool "Ext4 extended attributes" - depends on EXT4_FS - default y - help - Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit for details). - - If unsure, say N. - - You need this for POSIX ACL support on ext4.
- config EXT4_FS_POSIX_ACL bool "Ext4 POSIX Access Control Lists" - depends on EXT4_FS_XATTR select FS_POSIX_ACL help POSIX Access Control Lists (ACLs) support permissions for users and @@ -67,7 +53,6 @@ config EXT4_FS_POSIX_ACL config EXT4_FS_SECURITY bool "Ext4 Security Labels" - depends on EXT4_FS_XATTR help Security labels support alternative access control models implemented by security modules like SELinux. This option diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 3d96d5698538..0310fec2ee3d 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -7,8 +7,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ - mmp.o indirect.o extents_status.o + mmp.o indirect.o extents_status.o xattr.o xattr_user.o \ + xattr_trusted.o inline.o -ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o inline.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b90e2720b826..e20dc38858d4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -848,7 +848,6 @@ struct ext4_inode_info { #endif unsigned long i_flags; -#ifdef CONFIG_EXT4_FS_XATTR /* * Extended attributes can be read independently of the main file * data. Taking i_mutex even when reading would cause contention @@ -857,7 +856,6 @@ struct ext4_inode_info { * EAs. */ struct rw_semaphore xattr_sem; -#endif struct list_head i_orphan; /* unlinked but open inodes */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 2f5759eb9f89..b64a60bf105a 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -656,12 +656,10 @@ const struct file_operations ext4_file_operations = { const struct inode_operations ext4_file_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_getattr, -#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, -#endif .get_acl = ext4_get_acl, .fiemap = ext4_fiemap, }; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index b37c21839833..cac448282331 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3228,23 +3228,19 @@ const struct inode_operations ext4_dir_inode_operations = { .mknod = ext4_mknod, .rename = ext4_rename, .setattr = ext4_setattr, -#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, -#endif .get_acl = ext4_get_acl, .fiemap = ext4_fiemap, }; const struct inode_operations ext4_special_inode_operations = { .setattr = ext4_setattr, -#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, -#endif .get_acl = ext4_get_acl, }; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e1e216f8e9bd..7d53adff8bd3 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -997,9 +997,7 @@ static void init_once(void *foo) struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; INIT_LIST_HEAD(&ei->i_orphan); -#ifdef CONFIG_EXT4_FS_XATTR init_rwsem(&ei->xattr_sem); -#endif init_rwsem(&ei->i_data_sem); inode_init_once(&ei->vfs_inode); } @@ -1449,13 +1447,8 @@ static const struct mount_opts { {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ}, {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ}, {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, 
MOPT_DATAJ}, -#ifdef CONFIG_EXT4_FS_XATTR {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR}, -#else - {Opt_user_xattr, 0, MOPT_NOSUPPORT}, - {Opt_nouser_xattr, 0, MOPT_NOSUPPORT}, -#endif #ifdef CONFIG_EXT4_FS_POSIX_ACL {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR}, @@ -3368,9 +3361,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (def_mount_opts & EXT4_DEFM_UID16) set_opt(sb, NO_UID32); /* xattr user namespace & acls are now defaulted on */ -#ifdef CONFIG_EXT4_FS_XATTR set_opt(sb, XATTR_USER); -#endif #ifdef CONFIG_EXT4_FS_POSIX_ACL set_opt(sb, POSIX_ACL); #endif diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index ed9354aff279..ff3711932018 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -35,22 +35,18 @@ const struct inode_operations ext4_symlink_inode_operations = { .follow_link = page_follow_link_light, .put_link = page_put_link, .setattr = ext4_setattr, -#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, -#endif }; const struct inode_operations ext4_fast_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = ext4_follow_link, .setattr = ext4_setattr, -#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, -#endif }; diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 4222388c772f..7b5513ed3b38 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -92,8 +92,6 @@ struct ext4_xattr_ibody_find { struct ext4_iloc iloc; }; -# ifdef CONFIG_EXT4_FS_XATTR - extern const struct xattr_handler ext4_xattr_user_handler; extern const struct xattr_handler ext4_xattr_trusted_handler; extern const struct xattr_handler ext4_xattr_acl_access_handler; @@ -193,239 +191,6 @@ extern int ext4_try_to_evict_inline_data(handle_t *handle, extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline); extern int ext4_convert_inline_data(struct inode *inode); -# else /* CONFIG_EXT4_FS_XATTR */ - -static inline int -ext4_xattr_get(struct inode *inode, int name_index, const char *name, - void *buffer, size_t size, int flags) -{ - return -EOPNOTSUPP; -} - -static inline int -ext4_xattr_set(struct inode *inode, int name_index, const char *name, - const void *value, size_t size, int flags) -{ - return -EOPNOTSUPP; -} - -static inline int -ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, - const char *name, const void *value, size_t size, int flags) -{ - return -EOPNOTSUPP; -} - -static inline void -ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) -{ -} - -static inline void -ext4_xattr_put_super(struct super_block *sb) -{ -} - -static __init inline int -ext4_init_xattr(void) -{ - return 0; -} - -static inline void -ext4_exit_xattr(void) -{ -} - -static inline int -ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, - struct ext4_inode *raw_inode, handle_t *handle) -{ - return -EOPNOTSUPP; -} - -#define ext4_xattr_handlers NULL - -static inline int -ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is) -{ - return -EOPNOTSUPP; -} - -static inline int -ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, - struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is) -{ - return -EOPNOTSUPP; -} - -static inline int 
-ext4_xattr_ibody_get(struct inode *inode, int name_index, - const char *name, - void *buffer, size_t buffer_size) -{ - return -EOPNOTSUPP; -} - -static inline int ext4_find_inline_data_nolock(struct inode *inode) -{ - return 0; -} - -static inline int ext4_has_inline_data(struct inode *inode) -{ - return 0; -} - -static inline int ext4_get_inline_size(struct inode *inode) -{ - return 0; -} - -static inline int ext4_get_max_inline_size(struct inode *inode) -{ - return 0; -} - -static inline void ext4_write_inline_data(struct inode *inode, - struct ext4_iloc *iloc, - void *buffer, loff_t pos, - unsigned int len) -{ - return; -} - -static inline int ext4_init_inline_data(handle_t *handle, - struct inode *inode, - unsigned int len) -{ - return 0; -} - -static inline int ext4_destroy_inline_data(handle_t *handle, - struct inode *inode) -{ - return 0; -} - -static inline int ext4_readpage_inline(struct inode *inode, struct page *page) -{ - return 0; -} - -static inline int ext4_try_to_write_inline_data(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - unsigned flags, - struct page **pagep) -{ - return 0; -} - -static inline int ext4_write_inline_data_end(struct inode *inode, - loff_t pos, unsigned len, - unsigned copied, - struct page *page) -{ - return 0; -} - -static inline struct buffer_head * -ext4_journalled_write_inline_data(struct inode *inode, - unsigned len, - struct page *page) -{ - return NULL; -} - -static inline int -ext4_da_write_inline_data_begin(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - unsigned flags, - struct page **pagep, - void **fsdata) -{ - return 0; -} - -static inline int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, - unsigned len, unsigned copied, - struct page *page) -{ - return 0; -} - -static inline int ext4_try_add_inline_entry(handle_t *handle, - struct dentry *dentry, - struct inode *inode) -{ - return 0; -} - -static inline int ext4_try_create_inline_dir(handle_t *handle, - struct inode *parent, - struct inode *inode) -{ - return 0; -} -static inline int ext4_read_inline_dir(struct file *filp, - void *dirent, filldir_t filldir, - int *has_inline_data) -{ - return 0; -} - -static inline struct buffer_head * -ext4_find_inline_entry(struct inode *dir, - const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir, - int *has_inline_data) -{ - return NULL; -} -static inline int ext4_delete_inline_entry(handle_t *handle, - struct inode *dir, - struct ext4_dir_entry_2 *de_del, - struct buffer_head *bh, - int *has_inline_data) -{ - return 0; -} - -static inline int empty_inline_dir(struct inode *dir, int *has_inline_data) -{ - return 0; -} - -static inline struct buffer_head * -ext4_get_first_inline_block(struct inode *inode, - struct ext4_dir_entry_2 **parent_de, - int *retval) -{ - return NULL; -} - -static inline int ext4_inline_data_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, - int *has_inline) -{ - return 0; -} - -static inline void ext4_inline_data_truncate(struct inode *inode, - int *has_inline) -{ - return; -} - -static inline int ext4_convert_inline_data(struct inode *inode) -{ - return 0; -} -# endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY extern int ext4_init_security(handle_t *handle, struct inode *inode, -- cgit v1.2.1 From 9a4c8019471386c6fb039ae9e30f5216b6b55a9e Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Mon, 10 Dec 2012 16:30:45 -0500 Subject: ext4: ensure Inode flags consistency are checked at build time Flags 
being used by atomic operations on inode flags (e.g. ext4_test_inode_flag()) should be consistent with what is actually stored in inodes, i.e. EXT4_XXX_FL. It ensures that this consistency is checked at build time, not at run time. Currently, the flags consistency is checked at run time, but there is no real reason not to do a build-time check instead. The code compares macro-defined values with enum-type variables, where both are constants, so there is no problem in comparing constants at build time. Enum variables are treated as constants by the C compiler, according to the C99 spec (see www.open-std.org/jtc1/sc22/wg14/www/docs/n1124.pdf, sec. 6.2.5, item 16), so there is no real problem in comparing an enumeration type at build time. Signed-off-by: Carlos Maiolino Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 29 +++++++++++++---------------- fs/ext4/super.c | 1 + 2 files changed, 14 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e20dc38858d4..b79d613091d0 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -463,25 +463,22 @@ enum { EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ }; -#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) -#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \ - printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \ - EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); } - -/* - * Since it's pretty easy to mix up bit numbers and hex values, and we - * can't do a compile-time test for ENUM values, we use a run-time - * test to make sure that EXT4_XXX_FL is consistent with respect to - * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop - * out so it won't cost any extra space in the compiled kernel image. - * But it's important that these values are the same, since we are - * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL - * must be consistent with the values of FS_XXX_FL defined in - * include/linux/fs.h and the on-disk values found in ext2, ext3, and - * ext4 filesystems, and of course the values defined in e2fsprogs. +/* + * Since it's pretty easy to mix up bit numbers and hex values, we use a + * build-time check to make sure that EXT4_XXX_FL is consistent with respect to + * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost + * any extra space in the compiled kernel image, otherwise, the build will fail. + * It's important that these values are the same, since we are using + * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent + * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk + * values found in ext2, ext3 and ext4 filesystems, and of course the values + * defined in e2fsprogs. * * It's not paranoia if the Murphy's Law really *is* out to get you.
:-) */ +#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) +#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) + static inline void ext4_check_flag_values(void) { CHECK_FLAG_VALUE(SECRM); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7d53adff8bd3..3cdb0a2fc648 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5278,6 +5278,7 @@ static int __init ext4_init_fs(void) ext4_li_info = NULL; mutex_init(&ext4_li_mtx); + /* Build-time check for flags consistency */ ext4_check_flag_values(); for (i = 0; i < EXT4_WQ_HASH_SZ; i++) { -- cgit v1.2.1 From 0a5c33e23c4d781ecc815002c54f1f91012c703d Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Fri, 7 Dec 2012 16:17:28 -0500 Subject: NFSD: Pass correct buffer size to rpc_ntop I honestly have no idea where I got 129 from, but it's a much bigger value than the actual buffer size (INET6_ADDRSTRLEN). Signed-off-by: Bryan Schumaker Signed-off-by: J. Bruce Fields --- fs/nfsd/fault_inject.c | 2 +- fs/nfsd/nfs4state.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c index 96ffdf55dcec..7a7b079fbdb1 100644 --- a/fs/nfsd/fault_inject.c +++ b/fs/nfsd/fault_inject.c @@ -79,7 +79,7 @@ static void nfsd_inject_set_client(struct nfsd_fault_inject_op *op, clp = nfsd_find_client(addr, addr_size); if (clp) { count = op->forget(clp, 0); - rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, 129); + rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf)); printk(KERN_INFO "NFSD [%s]: Client %s had %llu state object(s)\n", op->file, buf, count); } nfs4_unlock_state(); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3d27f08e2297..8e2555112966 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4623,7 +4623,7 @@ u64 nfsd_forget_client(struct nfs4_client *clp, u64 max) u64 nfsd_print_client(struct nfs4_client *clp, u64 num) { char buf[INET6_ADDRSTRLEN]; - rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, 129); + rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf)); printk(KERN_INFO "NFS Client: %s\n", buf); return 1; } @@ -4632,7 +4632,7 @@ static void nfsd_print_count(struct nfs4_client *clp, unsigned int count, const char *type) { char buf[INET6_ADDRSTRLEN]; - rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, 129); + rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf)); printk(KERN_INFO "NFS Client: %s has %u %s\n", buf, count, type); } -- cgit v1.2.1 From 18d9a2ca2ea1aa963a077fb49e7efcc3b0237a9b Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Fri, 7 Dec 2012 16:17:29 -0500 Subject: NFSD: Correct the size calculation in fault_inject_write If len == 0 we end up with size = (0 - 1), which could cause bad things to happen in copy_from_user(). Signed-off-by: Bryan Schumaker Signed-off-by: J. 
Bruce Fields --- fs/nfsd/fault_inject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c index 7a7b079fbdb1..e761ee95617f 100644 --- a/fs/nfsd/fault_inject.c +++ b/fs/nfsd/fault_inject.c @@ -122,7 +122,7 @@ static ssize_t fault_inject_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { char write_buf[INET6_ADDRSTRLEN]; - size_t size = min(sizeof(write_buf), len) - 1; + size_t size = min(sizeof(write_buf) - 1, len); struct net *net = current->nsproxy->net_ns; struct sockaddr_storage sa; u64 val; -- cgit v1.2.1 From 39a53e0ce0df01b3cf4bb898c7ae2fd2189647d5 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 28 Nov 2012 13:37:31 +0900 Subject: f2fs: add superblock and major in-memory structure This adds the following major in-memory structures in f2fs. - f2fs_sb_info: contains f2fs-specific information, two special inode pointers for node and meta address spaces, and orphan inode management. - f2fs_inode_info: contains vfs_inode and other fs-specific information. - f2fs_nm_info: contains node manager information such as NAT entry cache, free nid list, and NAT page management. - f2fs_node_info: represents a node as node id, inode number, block address, and its version. - f2fs_sm_info: contains segment manager information such as SIT entry cache, free segment map, current active logs, dirty segment management, and segment utilization. The specific structures are sit_info, free_segmap_info, dirty_seglist_info, curseg_info. In addition, add F2FS_SUPER_MAGIC in magic.h. Signed-off-by: Chul Lee Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1062 +++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/f2fs/node.h | 353 ++++++++++++++++++ fs/f2fs/segment.h | 615 +++++++++++++++++++++++++++++++ 3 files changed, 2030 insertions(+) create mode 100644 fs/f2fs/f2fs.h create mode 100644 fs/f2fs/node.h create mode 100644 fs/f2fs/segment.h (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h new file mode 100644 index 000000000000..7aa70b54172d --- /dev/null +++ b/fs/f2fs/f2fs.h @@ -0,0 +1,1062 @@ +/** + * fs/f2fs/f2fs.h + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#ifndef _LINUX_F2FS_H +#define _LINUX_F2FS_H + +#include +#include +#include +#include +#include +#include +#include + +/* + * For mount options + */ +#define F2FS_MOUNT_BG_GC 0x00000001 +#define F2FS_MOUNT_DISABLE_ROLL_FORWARD 0x00000002 +#define F2FS_MOUNT_DISCARD 0x00000004 +#define F2FS_MOUNT_NOHEAP 0x00000008 +#define F2FS_MOUNT_XATTR_USER 0x00000010 +#define F2FS_MOUNT_POSIX_ACL 0x00000020 +#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040 + +#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) +#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) +#define test_opt(sbi, option) (sbi->mount_opt.opt & F2FS_MOUNT_##option) + +#define ver_after(a, b) (typecheck(unsigned long long, a) && \ + typecheck(unsigned long long, b) && \ + ((long long)((a) - (b)) > 0)) + +typedef u64 block_t; +typedef u32 nid_t; + +struct f2fs_mount_info { + unsigned int opt; +}; + +static inline __u32 f2fs_crc32(void *buff, size_t len) +{ + return crc32_le(F2FS_SUPER_MAGIC, buff, len); +} + +static inline bool f2fs_crc_valid(__u32 blk_crc, void *buff, size_t buff_size) +{ + return f2fs_crc32(buff, buff_size) == blk_crc; +} + +/* + * For checkpoint manager + */ +enum { + NAT_BITMAP, + SIT_BITMAP +}; + +/* for the list of orphan inodes */ +struct orphan_inode_entry { + struct list_head list; /* list head */ + nid_t ino; /* inode number */ +}; + +/* for the list of directory inodes */ +struct dir_inode_entry { + struct list_head list; /* list head */ + struct inode *inode; /* vfs inode pointer */ +}; + +/* for the list of fsync inodes, used only during recovery */ +struct fsync_inode_entry { + struct list_head list; /* list head */ + struct inode *inode; /* vfs inode pointer */ + block_t blkaddr; /* block address locating the last inode */ +}; + +#define nats_in_cursum(sum) (le16_to_cpu(sum->n_nats)) +#define sits_in_cursum(sum) (le16_to_cpu(sum->n_sits)) + +#define nat_in_journal(sum, i) (sum->nat_j.entries[i].ne) +#define nid_in_journal(sum, i) (sum->nat_j.entries[i].nid) +#define sit_in_journal(sum, i) (sum->sit_j.entries[i].se) +#define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno) + +static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i) +{ + int before = nats_in_cursum(rs); + rs->n_nats = cpu_to_le16(before + i); + return before; +} + +static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i) +{ + int before = sits_in_cursum(rs); + rs->n_sits = cpu_to_le16(before + i); + return before; +} + +/* + * For INODE and NODE manager + */ +#define XATTR_NODE_OFFSET (-1) /* + * store xattrs to one node block per + * file keeping -1 as its node offset to + * distinguish from index node blocks. + */ +#define RDONLY_NODE 1 /* + * specify a read-only mode when getting + * a node block. 0 is read-write mode. + * used by get_dnode_of_data(). + */ +#define F2FS_LINK_MAX 32000 /* maximum link count per file */ + +/* for in-memory extent cache entry */ +struct extent_info { + rwlock_t ext_lock; /* rwlock for consistency */ + unsigned int fofs; /* start offset in a file */ + u32 blk_addr; /* start block address of the extent */ + unsigned int len; /* lenth of the extent */ +}; + +/* + * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. 
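A note on the ver_after() macro defined earlier in this header: it uses the same signed-difference idiom as the kernel's time_after(), so the comparison stays correct even if the version counter ever wraps. A minimal userspace sketch, with the typecheck() guards dropped and the demo main() invented for illustration:

    #include <stdbool.h>
    #include <stdio.h>

    /* The unsigned subtraction wraps, and the sign of the result says
     * which version is newer, even across counter wraparound. */
    static bool ver_after(unsigned long long a, unsigned long long b)
    {
        return (long long)(a - b) > 0;
    }

    int main(void)
    {
        printf("%d\n", ver_after(5, 3));      /* 1: 5 is after 3 */
        printf("%d\n", ver_after(3, 5));      /* 0 */
        printf("%d\n", ver_after(2, ~0ULL));  /* 1: 2 is after a wrapped max */
        return 0;
    }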
+ */ +#define FADVISE_COLD_BIT 0x01 + +struct f2fs_inode_info { + struct inode vfs_inode; /* serve a vfs inode */ + unsigned long i_flags; /* keep an inode flags for ioctl */ + unsigned char i_advise; /* use to give file attribute hints */ + unsigned int i_current_depth; /* use only in directory structure */ + umode_t i_acl_mode; /* keep file acl mode temporarily */ + + /* Use below internally in f2fs*/ + unsigned long flags; /* use to pass per-file flags */ + unsigned long long data_version;/* lastes version of data for fsync */ + atomic_t dirty_dents; /* # of dirty dentry pages */ + f2fs_hash_t chash; /* hash value of given file name */ + unsigned int clevel; /* maximum level of given file name */ + nid_t i_xattr_nid; /* node id that contains xattrs */ + struct extent_info ext; /* in-memory extent cache entry */ +}; + +static inline void get_extent_info(struct extent_info *ext, + struct f2fs_extent i_ext) +{ + write_lock(&ext->ext_lock); + ext->fofs = le32_to_cpu(i_ext.fofs); + ext->blk_addr = le32_to_cpu(i_ext.blk_addr); + ext->len = le32_to_cpu(i_ext.len); + write_unlock(&ext->ext_lock); +} + +static inline void set_raw_extent(struct extent_info *ext, + struct f2fs_extent *i_ext) +{ + read_lock(&ext->ext_lock); + i_ext->fofs = cpu_to_le32(ext->fofs); + i_ext->blk_addr = cpu_to_le32(ext->blk_addr); + i_ext->len = cpu_to_le32(ext->len); + read_unlock(&ext->ext_lock); +} + +struct f2fs_nm_info { + block_t nat_blkaddr; /* base disk address of NAT */ + nid_t max_nid; /* maximum possible node ids */ + nid_t init_scan_nid; /* the first nid to be scanned */ + nid_t next_scan_nid; /* the next nid to be scanned */ + + /* NAT cache management */ + struct radix_tree_root nat_root;/* root of the nat entry cache */ + rwlock_t nat_tree_lock; /* protect nat_tree_lock */ + unsigned int nat_cnt; /* the # of cached nat entries */ + struct list_head nat_entries; /* cached nat entry list (clean) */ + struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */ + + /* free node ids management */ + struct list_head free_nid_list; /* a list for free nids */ + spinlock_t free_nid_list_lock; /* protect free nid list */ + unsigned int fcnt; /* the number of free node id */ + struct mutex build_lock; /* lock for build free nids */ + + /* for checkpoint */ + char *nat_bitmap; /* NAT bitmap pointer */ + int bitmap_size; /* bitmap size */ +}; + +/* + * this structure is used as one of function parameters. + * all the information are dedicated to a given direct node block determined + * by the data offset in a file. + */ +struct dnode_of_data { + struct inode *inode; /* vfs inode pointer */ + struct page *inode_page; /* its inode page, NULL is possible */ + struct page *node_page; /* cached direct node page */ + nid_t nid; /* node id of the direct node block */ + unsigned int ofs_in_node; /* data offset in the node page */ + bool inode_page_locked; /* inode page is locked or not */ + block_t data_blkaddr; /* block address of the node block */ +}; + +static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode, + struct page *ipage, struct page *npage, nid_t nid) +{ + dn->inode = inode; + dn->inode_page = ipage; + dn->node_page = npage; + dn->nid = nid; + dn->inode_page_locked = 0; +} + +/* + * For SIT manager + * + * By default, there are 6 active log areas across the whole main area. + * When considering hot and cold data separation to reduce cleaning overhead, + * we split 3 for data logs and 3 for node logs as hot, warm, and cold types, + * respectively. 
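The extent_info structure and its get_extent_info()/set_raw_extent() helpers above form a one-slot extent cache: a (fofs, blk_addr, len) triple copied in and out under a reader/writer lock. A rough userspace sketch of the lookup side using pthreads; the extent_set()/extent_lookup() helper names are invented for illustration:

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    struct extent_info {
        pthread_rwlock_t lock;
        uint32_t fofs;      /* start offset in the file */
        uint32_t blk_addr;  /* start block address of the extent */
        uint32_t len;       /* length of the extent */
    };

    static void extent_set(struct extent_info *e,
                           uint32_t fofs, uint32_t blk, uint32_t len)
    {
        pthread_rwlock_wrlock(&e->lock);
        e->fofs = fofs; e->blk_addr = blk; e->len = len;
        pthread_rwlock_unlock(&e->lock);
    }

    /* Returns 1 and fills *blk when fofs falls inside the cached extent. */
    static int extent_lookup(struct extent_info *e, uint32_t fofs, uint32_t *blk)
    {
        int hit = 0;
        pthread_rwlock_rdlock(&e->lock);
        if (fofs >= e->fofs && fofs < e->fofs + e->len) {
            *blk = e->blk_addr + (fofs - e->fofs);
            hit = 1;
        }
        pthread_rwlock_unlock(&e->lock);
        return hit;
    }

    int main(void)
    {
        struct extent_info e = { .lock = PTHREAD_RWLOCK_INITIALIZER };
        uint32_t blk;
        extent_set(&e, 100, 5000, 8);
        if (extent_lookup(&e, 103, &blk))
            printf("fofs 103 -> blk %u\n", blk);  /* prints 5003 */
        return 0;
    }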
+ * In the current design, you should not change the numbers intentionally. + * Instead, as a mount option such as active_logs=x, you can use 2, 4, and 6 + * logs individually according to the underlying devices. (default: 6) + * Just in case, on-disk layout covers maximum 16 logs that consist of 8 for + * data and 8 for node logs. + */ +#define NR_CURSEG_DATA_TYPE (3) +#define NR_CURSEG_NODE_TYPE (3) +#define NR_CURSEG_TYPE (NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE) + +enum { + CURSEG_HOT_DATA = 0, /* directory entry blocks */ + CURSEG_WARM_DATA, /* data blocks */ + CURSEG_COLD_DATA, /* multimedia or GCed data blocks */ + CURSEG_HOT_NODE, /* direct node blocks of directory files */ + CURSEG_WARM_NODE, /* direct node blocks of normal files */ + CURSEG_COLD_NODE, /* indirect node blocks */ + NO_CHECK_TYPE +}; + +struct f2fs_sm_info { + struct sit_info *sit_info; /* whole segment information */ + struct free_segmap_info *free_info; /* free segment information */ + struct dirty_seglist_info *dirty_info; /* dirty segment information */ + struct curseg_info *curseg_array; /* active segment information */ + + struct list_head wblist_head; /* list of under-writeback pages */ + spinlock_t wblist_lock; /* lock for checkpoint */ + + block_t seg0_blkaddr; /* block address of 0'th segment */ + block_t main_blkaddr; /* start block address of main area */ + block_t ssa_blkaddr; /* start block address of SSA area */ + + unsigned int segment_count; /* total # of segments */ + unsigned int main_segments; /* # of segments in main area */ + unsigned int reserved_segments; /* # of reserved segments */ + unsigned int ovp_segments; /* # of overprovision segments */ +}; + +/* + * For directory operation + */ +#define NODE_DIR1_BLOCK (ADDRS_PER_INODE + 1) +#define NODE_DIR2_BLOCK (ADDRS_PER_INODE + 2) +#define NODE_IND1_BLOCK (ADDRS_PER_INODE + 3) +#define NODE_IND2_BLOCK (ADDRS_PER_INODE + 4) +#define NODE_DIND_BLOCK (ADDRS_PER_INODE + 5) + +/* + * For superblock + */ +/* + * COUNT_TYPE for monitoring + * + * f2fs monitors the number of several block types such as on-writeback, + * dirty dentry blocks, dirty node blocks, and dirty meta blocks. + */ +enum count_type { + F2FS_WRITEBACK, + F2FS_DIRTY_DENTS, + F2FS_DIRTY_NODES, + F2FS_DIRTY_META, + NR_COUNT_TYPE, +}; + +/* + * FS_LOCK nesting subclasses for the lock validator: + * + * The locking order between these classes is + * RENAME -> DENTRY_OPS -> DATA_WRITE -> DATA_NEW + * -> DATA_TRUNC -> NODE_WRITE -> NODE_NEW -> NODE_TRUNC + */ +enum lock_type { + RENAME, /* for renaming operations */ + DENTRY_OPS, /* for directory operations */ + DATA_WRITE, /* for data write */ + DATA_NEW, /* for data allocation */ + DATA_TRUNC, /* for data truncate */ + NODE_NEW, /* for node allocation */ + NODE_TRUNC, /* for node truncate */ + NODE_WRITE, /* for node write */ + NR_LOCK_TYPE, +}; + +/* + * The below are the page types of bios used in submti_bio(). + * The available types are: + * DATA User data pages. It operates as async mode. + * NODE Node pages. It operates as async mode. + * META FS metadata pages such as SIT, NAT, CP. + * NR_PAGE_TYPE The number of page types. + * META_FLUSH Make sure the previous pages are written + * with waiting the bio's completion + * ... Only can be used with META. 
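The fs_lock comment above fixes a single global acquisition order between the lock classes, which is what makes a set of per-class mutexes deadlock-free; mutex_lock_nested(), used by mutex_lock_op() further down, only annotates that order for lockdep rather than enforcing it. A toy userspace sketch (invented demo code, not kernel code) that actively asserts ascending acquisition, under the simplifying assumption that a thread releases its outermost lock last:

    #include <assert.h>
    #include <pthread.h>
    #include <stdio.h>

    enum { RENAME, DENTRY_OPS, DATA_WRITE, DATA_NEW, DATA_TRUNC,
           NODE_NEW, NODE_TRUNC, NODE_WRITE, NR_LOCK_TYPE };

    static pthread_mutex_t fs_lock[NR_LOCK_TYPE];
    static __thread int top_level = -1;  /* highest class this thread holds */

    static void lock_op(int t)
    {
        assert(t > top_level);  /* out-of-order acquisition could deadlock */
        pthread_mutex_lock(&fs_lock[t]);
        top_level = t;
    }

    static void unlock_op(int t)
    {
        pthread_mutex_unlock(&fs_lock[t]);
        top_level = -1;         /* simplification: outermost released last */
    }

    int main(void)
    {
        for (int i = 0; i < NR_LOCK_TYPE; i++)
            pthread_mutex_init(&fs_lock[i], NULL);

        lock_op(DENTRY_OPS);    /* a directory operation ...           */
        lock_op(DATA_WRITE);    /* ... that also writes data: in order */
        unlock_op(DATA_WRITE);
        unlock_op(DENTRY_OPS);
        puts("lock order respected");
        return 0;
    }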
+ */ +enum page_type { + DATA, + NODE, + META, + NR_PAGE_TYPE, + META_FLUSH, +}; + +struct f2fs_sb_info { + struct super_block *sb; /* pointer to VFS super block */ + struct buffer_head *raw_super_buf; /* buffer head of raw sb */ + struct f2fs_super_block *raw_super; /* raw super block pointer */ + int s_dirty; /* dirty flag for checkpoint */ + + /* for node-related operations */ + struct f2fs_nm_info *nm_info; /* node manager */ + struct inode *node_inode; /* cache node blocks */ + + /* for segment-related operations */ + struct f2fs_sm_info *sm_info; /* segment manager */ + struct bio *bio[NR_PAGE_TYPE]; /* bios to merge */ + sector_t last_block_in_bio[NR_PAGE_TYPE]; /* last block number */ + struct rw_semaphore bio_sem; /* IO semaphore */ + + /* for checkpoint */ + struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ + struct inode *meta_inode; /* cache meta blocks */ + struct mutex cp_mutex; /* for checkpoint procedure */ + struct mutex fs_lock[NR_LOCK_TYPE]; /* for blocking FS operations */ + struct mutex write_inode; /* mutex for write inode */ + struct mutex writepages; /* mutex for writepages() */ + int por_doing; /* recovery is doing or not */ + + /* for orphan inode management */ + struct list_head orphan_inode_list; /* orphan inode list */ + struct mutex orphan_inode_mutex; /* for orphan inode list */ + unsigned int n_orphans; /* # of orphan inodes */ + + /* for directory inode management */ + struct list_head dir_inode_list; /* dir inode list */ + spinlock_t dir_inode_lock; /* for dir inode list lock */ + unsigned int n_dirty_dirs; /* # of dir inodes */ + + /* basic file system units */ + unsigned int log_sectors_per_block; /* log2 sectors per block */ + unsigned int log_blocksize; /* log2 block size */ + unsigned int blocksize; /* block size */ + unsigned int root_ino_num; /* root inode number*/ + unsigned int node_ino_num; /* node inode number*/ + unsigned int meta_ino_num; /* meta inode number*/ + unsigned int log_blocks_per_seg; /* log2 blocks per segment */ + unsigned int blocks_per_seg; /* blocks per segment */ + unsigned int segs_per_sec; /* segments per section */ + unsigned int secs_per_zone; /* sections per zone */ + unsigned int total_sections; /* total section count */ + unsigned int total_node_count; /* total node block count */ + unsigned int total_valid_node_count; /* valid node block count */ + unsigned int total_valid_inode_count; /* valid inode count */ + int active_logs; /* # of active logs */ + + block_t user_block_count; /* # of user blocks */ + block_t total_valid_block_count; /* # of valid blocks */ + block_t alloc_valid_block_count; /* # of allocated blocks */ + block_t last_valid_block_count; /* for recovery */ + u32 s_next_generation; /* for NFS support */ + atomic_t nr_pages[NR_COUNT_TYPE]; /* # of pages, see count_type */ + + struct f2fs_mount_info mount_opt; /* mount options */ + + /* for cleaning operations */ + struct mutex gc_mutex; /* mutex for GC */ + struct f2fs_gc_kthread *gc_thread; /* GC thread */ + + /* + * for stat information. + * one is for the LFS mode, and the other is for the SSR mode. 
+ */ + struct f2fs_stat_info *stat_info; /* FS status information */ + unsigned int segment_count[2]; /* # of allocated segments */ + unsigned int block_count[2]; /* # of allocated blocks */ + unsigned int last_victim[2]; /* last victim segment # */ + int total_hit_ext, read_hit_ext; /* extent cache hit ratio */ + int bg_gc; /* background gc calls */ + spinlock_t stat_lock; /* lock for stat operations */ +}; + +/* + * Inline functions + */ +static inline struct f2fs_inode_info *F2FS_I(struct inode *inode) +{ + return container_of(inode, struct f2fs_inode_info, vfs_inode); +} + +static inline struct f2fs_sb_info *F2FS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi) +{ + return (struct f2fs_super_block *)(sbi->raw_super); +} + +static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi) +{ + return (struct f2fs_checkpoint *)(sbi->ckpt); +} + +static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi) +{ + return (struct f2fs_nm_info *)(sbi->nm_info); +} + +static inline struct f2fs_sm_info *SM_I(struct f2fs_sb_info *sbi) +{ + return (struct f2fs_sm_info *)(sbi->sm_info); +} + +static inline struct sit_info *SIT_I(struct f2fs_sb_info *sbi) +{ + return (struct sit_info *)(SM_I(sbi)->sit_info); +} + +static inline struct free_segmap_info *FREE_I(struct f2fs_sb_info *sbi) +{ + return (struct free_segmap_info *)(SM_I(sbi)->free_info); +} + +static inline struct dirty_seglist_info *DIRTY_I(struct f2fs_sb_info *sbi) +{ + return (struct dirty_seglist_info *)(SM_I(sbi)->dirty_info); +} + +static inline void F2FS_SET_SB_DIRT(struct f2fs_sb_info *sbi) +{ + sbi->s_dirty = 1; +} + +static inline void F2FS_RESET_SB_DIRT(struct f2fs_sb_info *sbi) +{ + sbi->s_dirty = 0; +} + +static inline void mutex_lock_op(struct f2fs_sb_info *sbi, enum lock_type t) +{ + mutex_lock_nested(&sbi->fs_lock[t], t); +} + +static inline void mutex_unlock_op(struct f2fs_sb_info *sbi, enum lock_type t) +{ + mutex_unlock(&sbi->fs_lock[t]); +} + +/* + * Check whether the given nid is within node id range. 
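F2FS_I() above is the usual container_of() pattern: the generic VFS inode is embedded inside f2fs_inode_info, so a pointer to the embedded member can be converted back to its containing structure with offsetof() arithmetic. A self-contained userspace sketch with invented demo types:

    #include <stddef.h>
    #include <stdio.h>

    /* Same shape as the kernel macro: step back from the member to the
     * start of the enclosing structure. */
    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct inode { unsigned long i_ino; };

    struct demo_inode_info {
        unsigned int clevel;
        struct inode vfs_inode;   /* embedded generic inode */
    };

    static struct demo_inode_info *DEMO_I(struct inode *inode)
    {
        return container_of(inode, struct demo_inode_info, vfs_inode);
    }

    int main(void)
    {
        struct demo_inode_info fi = { .clevel = 3, .vfs_inode = { .i_ino = 42 } };
        struct inode *vfs = &fi.vfs_inode;  /* what the VFS layer hands back */
        printf("clevel=%u ino=%lu\n", DEMO_I(vfs)->clevel, vfs->i_ino);
        return 0;
    }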
+ */ +static inline void check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) +{ + BUG_ON((nid >= NM_I(sbi)->max_nid)); +} + +#define F2FS_DEFAULT_ALLOCATED_BLOCKS 1 + +/* + * Check whether the inode has blocks or not + */ +static inline int F2FS_HAS_BLOCKS(struct inode *inode) +{ + if (F2FS_I(inode)->i_xattr_nid) + return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1); + else + return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS); +} + +static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, + struct inode *inode, blkcnt_t count) +{ + block_t valid_block_count; + + spin_lock(&sbi->stat_lock); + valid_block_count = + sbi->total_valid_block_count + (block_t)count; + if (valid_block_count > sbi->user_block_count) { + spin_unlock(&sbi->stat_lock); + return false; + } + inode->i_blocks += count; + sbi->total_valid_block_count = valid_block_count; + sbi->alloc_valid_block_count += (block_t)count; + spin_unlock(&sbi->stat_lock); + return true; +} + +static inline int dec_valid_block_count(struct f2fs_sb_info *sbi, + struct inode *inode, + blkcnt_t count) +{ + spin_lock(&sbi->stat_lock); + BUG_ON(sbi->total_valid_block_count < (block_t) count); + BUG_ON(inode->i_blocks < count); + inode->i_blocks -= count; + sbi->total_valid_block_count -= (block_t)count; + spin_unlock(&sbi->stat_lock); + return 0; +} + +static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) +{ + atomic_inc(&sbi->nr_pages[count_type]); + F2FS_SET_SB_DIRT(sbi); +} + +static inline void inode_inc_dirty_dents(struct inode *inode) +{ + atomic_inc(&F2FS_I(inode)->dirty_dents); +} + +static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) +{ + atomic_dec(&sbi->nr_pages[count_type]); +} + +static inline void inode_dec_dirty_dents(struct inode *inode) +{ + atomic_dec(&F2FS_I(inode)->dirty_dents); +} + +static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) +{ + return atomic_read(&sbi->nr_pages[count_type]); +} + +static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) +{ + block_t ret; + spin_lock(&sbi->stat_lock); + ret = sbi->total_valid_block_count; + spin_unlock(&sbi->stat_lock); + return ret; +} + +static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + + /* return NAT or SIT bitmap */ + if (flag == NAT_BITMAP) + return le32_to_cpu(ckpt->nat_ver_bitmap_bytesize); + else if (flag == SIT_BITMAP) + return le32_to_cpu(ckpt->sit_ver_bitmap_bytesize); + + return 0; +} + +static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + int offset = (flag == NAT_BITMAP) ? 
ckpt->sit_ver_bitmap_bytesize : 0; + return &ckpt->sit_nat_version_bitmap + offset; +} + +static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi) +{ + block_t start_addr; + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned long long ckpt_version = le64_to_cpu(ckpt->checkpoint_ver); + + start_addr = le64_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); + + /* + * odd numbered checkpoint should at cp segment 0 + * and even segent must be at cp segment 1 + */ + if (!(ckpt_version & 1)) + start_addr += sbi->blocks_per_seg; + + return start_addr; +} + +static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) +{ + return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); +} + +static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, + struct inode *inode, + unsigned int count) +{ + block_t valid_block_count; + unsigned int valid_node_count; + + spin_lock(&sbi->stat_lock); + + valid_block_count = sbi->total_valid_block_count + (block_t)count; + sbi->alloc_valid_block_count += (block_t)count; + valid_node_count = sbi->total_valid_node_count + count; + + if (valid_block_count > sbi->user_block_count) { + spin_unlock(&sbi->stat_lock); + return false; + } + + if (valid_node_count > sbi->total_node_count) { + spin_unlock(&sbi->stat_lock); + return false; + } + + if (inode) + inode->i_blocks += count; + sbi->total_valid_node_count = valid_node_count; + sbi->total_valid_block_count = valid_block_count; + spin_unlock(&sbi->stat_lock); + + return true; +} + +static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, + struct inode *inode, + unsigned int count) +{ + spin_lock(&sbi->stat_lock); + + BUG_ON(sbi->total_valid_block_count < count); + BUG_ON(sbi->total_valid_node_count < count); + BUG_ON(inode->i_blocks < count); + + inode->i_blocks -= count; + sbi->total_valid_node_count -= count; + sbi->total_valid_block_count -= (block_t)count; + + spin_unlock(&sbi->stat_lock); +} + +static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) +{ + unsigned int ret; + spin_lock(&sbi->stat_lock); + ret = sbi->total_valid_node_count; + spin_unlock(&sbi->stat_lock); + return ret; +} + +static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) +{ + spin_lock(&sbi->stat_lock); + BUG_ON(sbi->total_valid_inode_count == sbi->total_node_count); + sbi->total_valid_inode_count++; + spin_unlock(&sbi->stat_lock); +} + +static inline int dec_valid_inode_count(struct f2fs_sb_info *sbi) +{ + spin_lock(&sbi->stat_lock); + BUG_ON(!sbi->total_valid_inode_count); + sbi->total_valid_inode_count--; + spin_unlock(&sbi->stat_lock); + return 0; +} + +static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) +{ + unsigned int ret; + spin_lock(&sbi->stat_lock); + ret = sbi->total_valid_inode_count; + spin_unlock(&sbi->stat_lock); + return ret; +} + +static inline void f2fs_put_page(struct page *page, int unlock) +{ + if (!page || IS_ERR(page)) + return; + + if (unlock) { + BUG_ON(!PageLocked(page)); + unlock_page(page); + } + page_cache_release(page); +} + +static inline void f2fs_put_dnode(struct dnode_of_data *dn) +{ + if (dn->node_page) + f2fs_put_page(dn->node_page, 1); + if (dn->inode_page && dn->node_page != dn->inode_page) + f2fs_put_page(dn->inode_page, 0); + dn->node_page = NULL; + dn->inode_page = NULL; +} + +static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name, + size_t size, void (*ctor)(void *)) +{ + return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, ctor); +} + +#define RAW_IS_INODE(p) ((p)->footer.nid == (p)->footer.ino) + +static 
inline bool IS_INODE(struct page *page) +{ + struct f2fs_node *p = (struct f2fs_node *)page_address(page); + return RAW_IS_INODE(p); +} + +static inline __le32 *blkaddr_in_node(struct f2fs_node *node) +{ + return RAW_IS_INODE(node) ? node->i.i_addr : node->dn.addr; +} + +static inline block_t datablock_addr(struct page *node_page, + unsigned int offset) +{ + struct f2fs_node *raw_node; + __le32 *addr_array; + raw_node = (struct f2fs_node *)page_address(node_page); + addr_array = blkaddr_in_node(raw_node); + return le32_to_cpu(addr_array[offset]); +} + +static inline int f2fs_test_bit(unsigned int nr, char *addr) +{ + int mask; + + addr += (nr >> 3); + mask = 1 << (7 - (nr & 0x07)); + return mask & *addr; +} + +static inline int f2fs_set_bit(unsigned int nr, char *addr) +{ + int mask; + int ret; + + addr += (nr >> 3); + mask = 1 << (7 - (nr & 0x07)); + ret = mask & *addr; + *addr |= mask; + return ret; +} + +static inline int f2fs_clear_bit(unsigned int nr, char *addr) +{ + int mask; + int ret; + + addr += (nr >> 3); + mask = 1 << (7 - (nr & 0x07)); + ret = mask & *addr; + *addr &= ~mask; + return ret; +} + +/* used for f2fs_inode_info->flags */ +enum { + FI_NEW_INODE, /* indicate newly allocated inode */ + FI_NEED_CP, /* need to do checkpoint during fsync */ + FI_INC_LINK, /* need to increment i_nlink */ + FI_ACL_MODE, /* indicate acl mode */ + FI_NO_ALLOC, /* should not allocate any blocks */ +}; + +static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) +{ + set_bit(flag, &fi->flags); +} + +static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag) +{ + return test_bit(flag, &fi->flags); +} + +static inline void clear_inode_flag(struct f2fs_inode_info *fi, int flag) +{ + clear_bit(flag, &fi->flags); +} + +static inline void set_acl_inode(struct f2fs_inode_info *fi, umode_t mode) +{ + fi->i_acl_mode = mode; + set_inode_flag(fi, FI_ACL_MODE); +} + +static inline int cond_clear_inode_flag(struct f2fs_inode_info *fi, int flag) +{ + if (is_inode_flag_set(fi, FI_ACL_MODE)) { + clear_inode_flag(fi, FI_ACL_MODE); + return 1; + } + return 0; +} + +/* + * file.c + */ +int f2fs_sync_file(struct file *, loff_t, loff_t, int); +void truncate_data_blocks(struct dnode_of_data *); +void f2fs_truncate(struct inode *); +int f2fs_setattr(struct dentry *, struct iattr *); +int truncate_hole(struct inode *, pgoff_t, pgoff_t); +long f2fs_ioctl(struct file *, unsigned int, unsigned long); + +/* + * inode.c + */ +void f2fs_set_inode_flags(struct inode *); +struct inode *f2fs_iget_nowait(struct super_block *, unsigned long); +struct inode *f2fs_iget(struct super_block *, unsigned long); +void update_inode(struct inode *, struct page *); +int f2fs_write_inode(struct inode *, struct writeback_control *); +void f2fs_evict_inode(struct inode *); + +/* + * namei.c + */ +struct dentry *f2fs_get_parent(struct dentry *child); + +/* + * dir.c + */ +struct f2fs_dir_entry *f2fs_find_entry(struct inode *, struct qstr *, + struct page **); +struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); +ino_t f2fs_inode_by_name(struct inode *, struct qstr *); +void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, + struct page *, struct inode *); +void init_dent_inode(struct dentry *, struct page *); +int f2fs_add_link(struct dentry *, struct inode *); +void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *); +int f2fs_make_empty(struct inode *, struct inode *); +bool f2fs_empty_dir(struct inode *); + +/* + * super.c + */ +int f2fs_sync_fs(struct super_block 
*, int); + +/* + * hash.c + */ +f2fs_hash_t f2fs_dentry_hash(const char *, int); + +/* + * node.c + */ +struct dnode_of_data; +struct node_info; + +int is_checkpointed_node(struct f2fs_sb_info *, nid_t); +void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); +int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); +int truncate_inode_blocks(struct inode *, pgoff_t); +int remove_inode_page(struct inode *); +int new_inode_page(struct inode *, struct dentry *); +struct page *new_node_page(struct dnode_of_data *, unsigned int); +void ra_node_page(struct f2fs_sb_info *, nid_t); +struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); +struct page *get_node_page_ra(struct page *, int); +void sync_inode_page(struct dnode_of_data *); +int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *); +bool alloc_nid(struct f2fs_sb_info *, nid_t *); +void alloc_nid_done(struct f2fs_sb_info *, nid_t); +void alloc_nid_failed(struct f2fs_sb_info *, nid_t); +void recover_node_page(struct f2fs_sb_info *, struct page *, + struct f2fs_summary *, struct node_info *, block_t); +int recover_inode_page(struct f2fs_sb_info *, struct page *); +int restore_node_summary(struct f2fs_sb_info *, unsigned int, + struct f2fs_summary_block *); +void flush_nat_entries(struct f2fs_sb_info *); +int build_node_manager(struct f2fs_sb_info *); +void destroy_node_manager(struct f2fs_sb_info *); +int create_node_manager_caches(void); +void destroy_node_manager_caches(void); + +/* + * segment.c + */ +void f2fs_balance_fs(struct f2fs_sb_info *); +void invalidate_blocks(struct f2fs_sb_info *, block_t); +void locate_dirty_segment(struct f2fs_sb_info *, unsigned int); +void clear_prefree_segments(struct f2fs_sb_info *); +int npages_for_summary_flush(struct f2fs_sb_info *); +void allocate_new_segments(struct f2fs_sb_info *); +struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); +struct bio *f2fs_bio_alloc(struct block_device *, sector_t, int, gfp_t); +void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool sync); +int write_meta_page(struct f2fs_sb_info *, struct page *, + struct writeback_control *); +void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int, + block_t, block_t *); +void write_data_page(struct inode *, struct page *, struct dnode_of_data*, + block_t, block_t *); +void rewrite_data_page(struct f2fs_sb_info *, struct page *, block_t); +void recover_data_page(struct f2fs_sb_info *, struct page *, + struct f2fs_summary *, block_t, block_t); +void rewrite_node_page(struct f2fs_sb_info *, struct page *, + struct f2fs_summary *, block_t, block_t); +void write_data_summaries(struct f2fs_sb_info *, block_t); +void write_node_summaries(struct f2fs_sb_info *, block_t); +int lookup_journal_in_cursum(struct f2fs_summary_block *, + int, unsigned int, int); +void flush_sit_entries(struct f2fs_sb_info *); +int build_segment_manager(struct f2fs_sb_info *); +void reset_victim_segmap(struct f2fs_sb_info *); +void destroy_segment_manager(struct f2fs_sb_info *); + +/* + * checkpoint.c + */ +struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); +struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); +long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); +int check_orphan_space(struct f2fs_sb_info *); +void add_orphan_inode(struct f2fs_sb_info *, nid_t); +void remove_orphan_inode(struct f2fs_sb_info *, nid_t); +int recover_orphan_inodes(struct f2fs_sb_info *); +int get_valid_checkpoint(struct f2fs_sb_info *); +void set_dirty_dir_page(struct 
inode *, struct page *); +void remove_dirty_dir_inode(struct inode *); +void sync_dirty_dir_inodes(struct f2fs_sb_info *); +void block_operations(struct f2fs_sb_info *); +void write_checkpoint(struct f2fs_sb_info *, bool, bool); +void init_orphan_info(struct f2fs_sb_info *); +int create_checkpoint_caches(void); +void destroy_checkpoint_caches(void); + +/* + * data.c + */ +int reserve_new_block(struct dnode_of_data *); +void update_extent_cache(block_t, struct dnode_of_data *); +struct page *find_data_page(struct inode *, pgoff_t); +struct page *get_lock_data_page(struct inode *, pgoff_t); +struct page *get_new_data_page(struct inode *, pgoff_t, bool); +int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int); +int do_write_data_page(struct page *); + +/* + * gc.c + */ +int start_gc_thread(struct f2fs_sb_info *); +void stop_gc_thread(struct f2fs_sb_info *); +block_t start_bidx_of_node(unsigned int); +int f2fs_gc(struct f2fs_sb_info *, int); +void build_gc_manager(struct f2fs_sb_info *); +int create_gc_caches(void); +void destroy_gc_caches(void); + +/* + * recovery.c + */ +void recover_fsync_data(struct f2fs_sb_info *); +bool space_for_roll_forward(struct f2fs_sb_info *); + +/* + * debug.c + */ +#ifdef CONFIG_F2FS_STAT_FS +struct f2fs_stat_info { + struct list_head stat_list; + struct f2fs_sb_info *sbi; + struct mutex stat_lock; + int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; + int main_area_segs, main_area_sections, main_area_zones; + int hit_ext, total_ext; + int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; + int nats, sits, fnids; + int total_count, utilization; + int bg_gc; + unsigned int valid_count, valid_node_count, valid_inode_count; + unsigned int bimodal, avg_vblocks; + int util_free, util_valid, util_invalid; + int rsvd_segs, overp_segs; + int dirty_count, node_pages, meta_pages; + int prefree_count, call_count; + int tot_segs, node_segs, data_segs, free_segs, free_secs; + int tot_blks, data_blks, node_blks; + int curseg[NR_CURSEG_TYPE]; + int cursec[NR_CURSEG_TYPE]; + int curzone[NR_CURSEG_TYPE]; + + unsigned int segment_count[2]; + unsigned int block_count[2]; + unsigned base_mem, cache_mem; +}; + +#define stat_inc_call_count(si) ((si)->call_count++) + +#define stat_inc_seg_count(sbi, type) \ + do { \ + struct f2fs_stat_info *si = sbi->stat_info; \ + (si)->tot_segs++; \ + if (type == SUM_TYPE_DATA) \ + si->data_segs++; \ + else \ + si->node_segs++; \ + } while (0) + +#define stat_inc_tot_blk_count(si, blks) \ + (si->tot_blks += (blks)) + +#define stat_inc_data_blk_count(sbi, blks) \ + do { \ + struct f2fs_stat_info *si = sbi->stat_info; \ + stat_inc_tot_blk_count(si, blks); \ + si->data_blks += (blks); \ + } while (0) + +#define stat_inc_node_blk_count(sbi, blks) \ + do { \ + struct f2fs_stat_info *si = sbi->stat_info; \ + stat_inc_tot_blk_count(si, blks); \ + si->node_blks += (blks); \ + } while (0) + +int f2fs_build_stats(struct f2fs_sb_info *); +void f2fs_destroy_stats(struct f2fs_sb_info *); +void destroy_root_stats(void); +#else +#define stat_inc_call_count(si) +#define stat_inc_seg_count(si, type) +#define stat_inc_tot_blk_count(si, blks) +#define stat_inc_data_blk_count(si, blks) +#define stat_inc_node_blk_count(sbi, blks) + +static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } +static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } +static inline void destroy_root_stats(void) { } +#endif + +extern const struct file_operations f2fs_dir_operations; +extern const struct file_operations 
f2fs_file_operations; +extern const struct inode_operations f2fs_file_inode_operations; +extern const struct address_space_operations f2fs_dblock_aops; +extern const struct address_space_operations f2fs_node_aops; +extern const struct address_space_operations f2fs_meta_aops; +extern const struct inode_operations f2fs_dir_inode_operations; +extern const struct inode_operations f2fs_symlink_inode_operations; +extern const struct inode_operations f2fs_special_inode_operations; +#endif diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h new file mode 100644 index 000000000000..5d525ed312ba --- /dev/null +++ b/fs/f2fs/node.h @@ -0,0 +1,353 @@ +/** + * fs/f2fs/node.h + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +/* start node id of a node block dedicated to the given node id */ +#define START_NID(nid) ((nid / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) + +/* node block offset on the NAT area dedicated to the given start node id */ +#define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK) + +/* # of pages to perform readahead before building free nids */ +#define FREE_NID_PAGES 4 + +/* maximum # of free node ids to produce during build_free_nids */ +#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES) + +/* maximum readahead size for node during getting data blocks */ +#define MAX_RA_NODE 128 + +/* maximum cached nat entries to manage memory footprint */ +#define NM_WOUT_THRESHOLD (64 * NAT_ENTRY_PER_BLOCK) + +/* vector size for gang look-up from nat cache that consists of radix tree */ +#define NATVEC_SIZE 64 + +/* + * For node information + */ +struct node_info { + nid_t nid; /* node id */ + nid_t ino; /* inode number of the node's owner */ + block_t blk_addr; /* block address of the node */ + unsigned char version; /* version of the node */ +}; + +struct nat_entry { + struct list_head list; /* for clean or dirty nat list */ + bool checkpointed; /* whether it is checkpointed or not */ + struct node_info ni; /* in-memory node information */ +}; + +#define nat_get_nid(nat) (nat->ni.nid) +#define nat_set_nid(nat, n) (nat->ni.nid = n) +#define nat_get_blkaddr(nat) (nat->ni.blk_addr) +#define nat_set_blkaddr(nat, b) (nat->ni.blk_addr = b) +#define nat_get_ino(nat) (nat->ni.ino) +#define nat_set_ino(nat, i) (nat->ni.ino = i) +#define nat_get_version(nat) (nat->ni.version) +#define nat_set_version(nat, v) (nat->ni.version = v) + +#define __set_nat_cache_dirty(nm_i, ne) \ + list_move_tail(&ne->list, &nm_i->dirty_nat_entries); +#define __clear_nat_cache_dirty(nm_i, ne) \ + list_move_tail(&ne->list, &nm_i->nat_entries); +#define inc_node_version(version) (++version) + +static inline void node_info_from_raw_nat(struct node_info *ni, + struct f2fs_nat_entry *raw_ne) +{ + ni->ino = le32_to_cpu(raw_ne->ino); + ni->blk_addr = le32_to_cpu(raw_ne->block_addr); + ni->version = raw_ne->version; +} + +/* + * For free nid mangement + */ +enum nid_state { + NID_NEW, /* newly added to free nid list */ + NID_ALLOC /* it is allocated */ +}; + +struct free_nid { + struct list_head list; /* for free node id list */ + nid_t nid; /* node id */ + int state; /* in use or not: NID_NEW or NID_ALLOC */ +}; + +static inline int next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *fnid; + + if (nm_i->fcnt <= 0) + return 
-1; + spin_lock(&nm_i->free_nid_list_lock); + fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list); + *nid = fnid->nid; + spin_unlock(&nm_i->free_nid_list_lock); + return 0; +} + +/* + * inline functions + */ +static inline void get_nat_bitmap(struct f2fs_sb_info *sbi, void *addr) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + memcpy(addr, nm_i->nat_bitmap, nm_i->bitmap_size); +} + +static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + pgoff_t block_off; + pgoff_t block_addr; + int seg_off; + + block_off = NAT_BLOCK_OFFSET(start); + seg_off = block_off >> sbi->log_blocks_per_seg; + + block_addr = (pgoff_t)(nm_i->nat_blkaddr + + (seg_off << sbi->log_blocks_per_seg << 1) + + (block_off & ((1 << sbi->log_blocks_per_seg) - 1))); + + if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) + block_addr += sbi->blocks_per_seg; + + return block_addr; +} + +static inline pgoff_t next_nat_addr(struct f2fs_sb_info *sbi, + pgoff_t block_addr) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + block_addr -= nm_i->nat_blkaddr; + if ((block_addr >> sbi->log_blocks_per_seg) % 2) + block_addr -= sbi->blocks_per_seg; + else + block_addr += sbi->blocks_per_seg; + + return block_addr + nm_i->nat_blkaddr; +} + +static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid) +{ + unsigned int block_off = NAT_BLOCK_OFFSET(start_nid); + + if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) + f2fs_clear_bit(block_off, nm_i->nat_bitmap); + else + f2fs_set_bit(block_off, nm_i->nat_bitmap); +} + +static inline void fill_node_footer(struct page *page, nid_t nid, + nid_t ino, unsigned int ofs, bool reset) +{ + void *kaddr = page_address(page); + struct f2fs_node *rn = (struct f2fs_node *)kaddr; + if (reset) + memset(rn, 0, sizeof(*rn)); + rn->footer.nid = cpu_to_le32(nid); + rn->footer.ino = cpu_to_le32(ino); + rn->footer.flag = cpu_to_le32(ofs << OFFSET_BIT_SHIFT); +} + +static inline void copy_node_footer(struct page *dst, struct page *src) +{ + void *src_addr = page_address(src); + void *dst_addr = page_address(dst); + struct f2fs_node *src_rn = (struct f2fs_node *)src_addr; + struct f2fs_node *dst_rn = (struct f2fs_node *)dst_addr; + memcpy(&dst_rn->footer, &src_rn->footer, sizeof(struct node_footer)); +} + +static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) +{ + struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + void *kaddr = page_address(page); + struct f2fs_node *rn = (struct f2fs_node *)kaddr; + rn->footer.cp_ver = ckpt->checkpoint_ver; + rn->footer.next_blkaddr = blkaddr; +} + +static inline nid_t ino_of_node(struct page *node_page) +{ + void *kaddr = page_address(node_page); + struct f2fs_node *rn = (struct f2fs_node *)kaddr; + return le32_to_cpu(rn->footer.ino); +} + +static inline nid_t nid_of_node(struct page *node_page) +{ + void *kaddr = page_address(node_page); + struct f2fs_node *rn = (struct f2fs_node *)kaddr; + return le32_to_cpu(rn->footer.nid); +} + +static inline unsigned int ofs_of_node(struct page *node_page) +{ + void *kaddr = page_address(node_page); + struct f2fs_node *rn = (struct f2fs_node *)kaddr; + unsigned flag = le32_to_cpu(rn->footer.flag); + return flag >> OFFSET_BIT_SHIFT; +} + +static inline unsigned long long cpver_of_node(struct page *node_page) +{ + void *kaddr = page_address(node_page); + struct f2fs_node *rn = (struct f2fs_node *)kaddr; + return le64_to_cpu(rn->footer.cp_ver); +} + +static inline block_t 
next_blkaddr_of_node(struct page *node_page) +{ + void *kaddr = page_address(node_page); + struct f2fs_node *rn = (struct f2fs_node *)kaddr; + return le32_to_cpu(rn->footer.next_blkaddr); +} + +/* + * f2fs assigns the following node offsets described as (num). + * N = NIDS_PER_BLOCK + * + * Inode block (0) + * |- direct node (1) + * |- direct node (2) + * |- indirect node (3) + * | `- direct node (4 => 4 + N - 1) + * |- indirect node (4 + N) + * | `- direct node (5 + N => 5 + 2N - 1) + * `- double indirect node (5 + 2N) + * `- indirect node (6 + 2N) + * `- direct node (x(N + 1)) + */ +static inline bool IS_DNODE(struct page *node_page) +{ + unsigned int ofs = ofs_of_node(node_page); + if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK || + ofs == 5 + 2 * NIDS_PER_BLOCK) + return false; + if (ofs >= 6 + 2 * NIDS_PER_BLOCK) { + ofs -= 6 + 2 * NIDS_PER_BLOCK; + if ((long int)ofs % (NIDS_PER_BLOCK + 1)) + return false; + } + return true; +} + +static inline void set_nid(struct page *p, int off, nid_t nid, bool i) +{ + struct f2fs_node *rn = (struct f2fs_node *)page_address(p); + + wait_on_page_writeback(p); + + if (i) + rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid); + else + rn->in.nid[off] = cpu_to_le32(nid); + set_page_dirty(p); +} + +static inline nid_t get_nid(struct page *p, int off, bool i) +{ + struct f2fs_node *rn = (struct f2fs_node *)page_address(p); + if (i) + return le32_to_cpu(rn->i.i_nid[off - NODE_DIR1_BLOCK]); + return le32_to_cpu(rn->in.nid[off]); +} + +/* + * Coldness identification: + * - Mark cold files in f2fs_inode_info + * - Mark cold node blocks in their node footer + * - Mark cold data pages in page cache + */ +static inline int is_cold_file(struct inode *inode) +{ + return F2FS_I(inode)->i_advise & FADVISE_COLD_BIT; +} + +static inline int is_cold_data(struct page *page) +{ + return PageChecked(page); +} + +static inline void set_cold_data(struct page *page) +{ + SetPageChecked(page); +} + +static inline void clear_cold_data(struct page *page) +{ + ClearPageChecked(page); +} + +static inline int is_cold_node(struct page *page) +{ + void *kaddr = page_address(page); + struct f2fs_node *rn = (struct f2fs_node *)kaddr; + unsigned int flag = le32_to_cpu(rn->footer.flag); + return flag & (0x1 << COLD_BIT_SHIFT); +} + +static inline unsigned char is_fsync_dnode(struct page *page) +{ + void *kaddr = page_address(page); + struct f2fs_node *rn = (struct f2fs_node *)kaddr; + unsigned int flag = le32_to_cpu(rn->footer.flag); + return flag & (0x1 << FSYNC_BIT_SHIFT); +} + +static inline unsigned char is_dent_dnode(struct page *page) +{ + void *kaddr = page_address(page); + struct f2fs_node *rn = (struct f2fs_node *)kaddr; + unsigned int flag = le32_to_cpu(rn->footer.flag); + return flag & (0x1 << DENT_BIT_SHIFT); +} + +static inline void set_cold_node(struct inode *inode, struct page *page) +{ + struct f2fs_node *rn = (struct f2fs_node *)page_address(page); + unsigned int flag = le32_to_cpu(rn->footer.flag); + + if (S_ISDIR(inode->i_mode)) + flag &= ~(0x1 << COLD_BIT_SHIFT); + else + flag |= (0x1 << COLD_BIT_SHIFT); + rn->footer.flag = cpu_to_le32(flag); +} + +static inline void set_fsync_mark(struct page *page, int mark) +{ + void *kaddr = page_address(page); + struct f2fs_node *rn = (struct f2fs_node *)kaddr; + unsigned int flag = le32_to_cpu(rn->footer.flag); + if (mark) + flag |= (0x1 << FSYNC_BIT_SHIFT); + else + flag &= ~(0x1 << FSYNC_BIT_SHIFT); + rn->footer.flag = cpu_to_le32(flag); +} + +static inline void set_dentry_mark(struct page *page, int mark) +{ + void *kaddr = 
page_address(page); + struct f2fs_node *rn = (struct f2fs_node *)kaddr; + unsigned int flag = le32_to_cpu(rn->footer.flag); + if (mark) + flag |= (0x1 << DENT_BIT_SHIFT); + else + flag &= ~(0x1 << DENT_BIT_SHIFT); + rn->footer.flag = cpu_to_le32(flag); +} diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h new file mode 100644 index 000000000000..e380a8ef13f5 --- /dev/null +++ b/fs/f2fs/segment.h @@ -0,0 +1,615 @@ +/** + * fs/f2fs/segment.h + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +/* constant macro */ +#define NULL_SEGNO ((unsigned int)(~0)) + +/* V: Logical segment # in volume, R: Relative segment # in main area */ +#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) +#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) + +#define IS_DATASEG(t) \ + ((t == CURSEG_HOT_DATA) || (t == CURSEG_COLD_DATA) || \ + (t == CURSEG_WARM_DATA)) + +#define IS_NODESEG(t) \ + ((t == CURSEG_HOT_NODE) || (t == CURSEG_COLD_NODE) || \ + (t == CURSEG_WARM_NODE)) + +#define IS_CURSEG(sbi, segno) \ + ((segno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ + (segno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ + (segno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ + (segno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ + (segno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ + (segno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) + +#define IS_CURSEC(sbi, secno) \ + ((secno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ + sbi->segs_per_sec) || \ + (secno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ + sbi->segs_per_sec) || \ + (secno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ + sbi->segs_per_sec) || \ + (secno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ + sbi->segs_per_sec) || \ + (secno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ + sbi->segs_per_sec) || \ + (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ + sbi->segs_per_sec)) \ + +#define START_BLOCK(sbi, segno) \ + (SM_I(sbi)->seg0_blkaddr + \ + (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg)) +#define NEXT_FREE_BLKADDR(sbi, curseg) \ + (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff) + +#define MAIN_BASE_BLOCK(sbi) (SM_I(sbi)->main_blkaddr) + +#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) \ + ((blk_addr) - SM_I(sbi)->seg0_blkaddr) +#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg) +#define GET_SEGNO(sbi, blk_addr) \ + (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? 
\ + NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ + GET_SEGNO_FROM_SEG0(sbi, blk_addr))) +#define GET_SECNO(sbi, segno) \ + ((segno) / sbi->segs_per_sec) +#define GET_ZONENO_FROM_SEGNO(sbi, segno) \ + ((segno / sbi->segs_per_sec) / sbi->secs_per_zone) + +#define GET_SUM_BLOCK(sbi, segno) \ + ((sbi->sm_info->ssa_blkaddr) + segno) + +#define GET_SUM_TYPE(footer) ((footer)->entry_type) +#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = type) + +#define SIT_ENTRY_OFFSET(sit_i, segno) \ + (segno % sit_i->sents_per_block) +#define SIT_BLOCK_OFFSET(sit_i, segno) \ + (segno / SIT_ENTRY_PER_BLOCK) +#define START_SEGNO(sit_i, segno) \ + (SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK) +#define f2fs_bitmap_size(nr) \ + (BITS_TO_LONGS(nr) * sizeof(unsigned long)) +#define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments) + +/* during checkpoint, bio_private is used to synchronize the last bio */ +struct bio_private { + struct f2fs_sb_info *sbi; + bool is_sync; + void *wait; +}; + +/* + * indicate a block allocation direction: RIGHT and LEFT. + * RIGHT means allocating new sections towards the end of volume. + * LEFT means the opposite direction. + */ +enum { + ALLOC_RIGHT = 0, + ALLOC_LEFT +}; + +/* + * In the victim_sel_policy->alloc_mode, there are two block allocation modes. + * LFS writes data sequentially with cleaning operations. + * SSR (Slack Space Recycle) reuses obsolete space without cleaning operations. + */ +enum { + LFS = 0, + SSR +}; + +/* + * In the victim_sel_policy->gc_mode, there are two gc, aka cleaning, modes. + * GC_CB is based on cost-benefit algorithm. + * GC_GREEDY is based on greedy algorithm. + */ +enum { + GC_CB = 0, + GC_GREEDY +}; + +/* + * BG_GC means the background cleaning job. + * FG_GC means the on-demand cleaning job. + */ +enum { + BG_GC = 0, + FG_GC +}; + +/* for a function parameter to select a victim segment */ +struct victim_sel_policy { + int alloc_mode; /* LFS or SSR */ + int gc_mode; /* GC_CB or GC_GREEDY */ + unsigned long *dirty_segmap; /* dirty segment bitmap */ + unsigned int offset; /* last scanned bitmap offset */ + unsigned int ofs_unit; /* bitmap search unit */ + unsigned int min_cost; /* minimum cost */ + unsigned int min_segno; /* segment # having min. cost */ +}; + +struct seg_entry { + unsigned short valid_blocks; /* # of valid blocks */ + unsigned char *cur_valid_map; /* validity bitmap of blocks */ + /* + * # of valid blocks and the validity bitmap stored in the the last + * checkpoint pack. This information is used by the SSR mode. 
+	 */
+	unsigned short ckpt_valid_blocks;
+	unsigned char *ckpt_valid_map;
+	unsigned char type;		/* segment type like CURSEG_XXX_TYPE */
+	unsigned long long mtime;	/* modification time of the segment */
+};
+
+struct sec_entry {
+	unsigned int valid_blocks;	/* # of valid blocks in a section */
+};
+
+struct segment_allocation {
+	void (*allocate_segment)(struct f2fs_sb_info *, int, bool);
+};
+
+struct sit_info {
+	const struct segment_allocation *s_ops;
+
+	block_t sit_base_addr;		/* start block address of SIT area */
+	block_t sit_blocks;		/* # of blocks used by SIT area */
+	block_t written_valid_blocks;	/* # of valid blocks in main area */
+	char *sit_bitmap;		/* SIT bitmap pointer */
+	unsigned int bitmap_size;	/* SIT bitmap size */
+
+	unsigned long *dirty_sentries_bitmap;	/* bitmap for dirty sentries */
+	unsigned int dirty_sentries;		/* # of dirty sentries */
+	unsigned int sents_per_block;		/* # of SIT entries per block */
+	struct mutex sentry_lock;		/* to protect SIT cache */
+	struct seg_entry *sentries;		/* SIT segment-level cache */
+	struct sec_entry *sec_entries;		/* SIT section-level cache */
+
+	/* for cost-benefit algorithm in cleaning procedure */
+	unsigned long long elapsed_time;	/* elapsed time after mount */
+	unsigned long long mounted_time;	/* mount time */
+	unsigned long long min_mtime;		/* min. modification time */
+	unsigned long long max_mtime;		/* max. modification time */
+};
+
+struct free_segmap_info {
+	unsigned int start_segno;	/* start segment number logically */
+	unsigned int free_segments;	/* # of free segments */
+	unsigned int free_sections;	/* # of free sections */
+	rwlock_t segmap_lock;		/* free segmap lock */
+	unsigned long *free_segmap;	/* free segment bitmap */
+	unsigned long *free_secmap;	/* free section bitmap */
+};
+
+/* Notice: the order of dirty types is the same as CURSEG_XXX in f2fs.h */
+enum dirty_type {
+	DIRTY_HOT_DATA,		/* dirty segments assigned as hot data logs */
+	DIRTY_WARM_DATA,	/* dirty segments assigned as warm data logs */
+	DIRTY_COLD_DATA,	/* dirty segments assigned as cold data logs */
+	DIRTY_HOT_NODE,		/* dirty segments assigned as hot node logs */
+	DIRTY_WARM_NODE,	/* dirty segments assigned as warm node logs */
+	DIRTY_COLD_NODE,	/* dirty segments assigned as cold node logs */
+	DIRTY,			/* to count # of dirty segments */
+	PRE,			/* to count # of entirely obsolete segments */
+	NR_DIRTY_TYPE
+};
+
+struct dirty_seglist_info {
+	const struct victim_selection *v_ops;	/* victim selection operation */
+	unsigned long *dirty_segmap[NR_DIRTY_TYPE];
+	struct mutex seglist_lock;		/* lock for segment bitmaps */
+	int nr_dirty[NR_DIRTY_TYPE];		/* # of dirty segments */
+	unsigned long *victim_segmap[2];	/* BG_GC, FG_GC */
+};
+
+/* victim selection function for cleaning and SSR */
+struct victim_selection {
+	int (*get_victim)(struct f2fs_sb_info *, unsigned int *,
+							int, int, char);
+};
+
+/* for active log information */
+struct curseg_info {
+	struct mutex curseg_mutex;		/* lock for consistency */
+	struct f2fs_summary_block *sum_blk;	/* cached summary block */
+	unsigned char alloc_type;		/* current allocation type */
+	unsigned int segno;			/* current segment number */
+	unsigned short next_blkoff;		/* next block offset to write */
+	unsigned int zone;			/* current zone number */
+	unsigned int next_segno;		/* preallocated segment */
+};
+
+/*
+ * inline functions
+ */
+static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type)
+{
+	return (struct curseg_info *)(SM_I(sbi)->curseg_array + type);
+}
+
+static inline struct seg_entry *get_seg_entry(struct f2fs_sb_info *sbi,
+						unsigned int segno)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	return &sit_i->sentries[segno];
+}
+
+static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi,
+						unsigned int segno)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	return &sit_i->sec_entries[GET_SECNO(sbi, segno)];
+}
+
+static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi,
+				unsigned int segno, int section)
+{
+	/*
+	 * In order to get # of valid blocks in a section instantly from many
+	 * segments, f2fs manages two counting structures separately.
+	 */
+	if (section > 1)
+		return get_sec_entry(sbi, segno)->valid_blocks;
+	else
+		return get_seg_entry(sbi, segno)->valid_blocks;
+}
+
+static inline void seg_info_from_raw_sit(struct seg_entry *se,
+					struct f2fs_sit_entry *rs)
+{
+	se->valid_blocks = GET_SIT_VBLOCKS(rs);
+	se->ckpt_valid_blocks = GET_SIT_VBLOCKS(rs);
+	memcpy(se->cur_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE);
+	memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE);
+	se->type = GET_SIT_TYPE(rs);
+	se->mtime = le64_to_cpu(rs->mtime);
+}
+
+static inline void seg_info_to_raw_sit(struct seg_entry *se,
+					struct f2fs_sit_entry *rs)
+{
+	unsigned short raw_vblocks = (se->type << SIT_VBLOCKS_SHIFT) |
+					se->valid_blocks;
+	rs->vblocks = cpu_to_le16(raw_vblocks);
+	memcpy(rs->valid_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE);
+	memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE);
+	se->ckpt_valid_blocks = se->valid_blocks;
+	rs->mtime = cpu_to_le64(se->mtime);
+}
+
+static inline unsigned int find_next_inuse(struct free_segmap_info *free_i,
+		unsigned int max, unsigned int segno)
+{
+	unsigned int ret;
+	read_lock(&free_i->segmap_lock);
+	ret = find_next_bit(free_i->free_segmap, max, segno);
+	read_unlock(&free_i->segmap_lock);
+	return ret;
+}
+
+static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+	struct free_segmap_info *free_i = FREE_I(sbi);
+	unsigned int secno = segno / sbi->segs_per_sec;
+	unsigned int start_segno = secno * sbi->segs_per_sec;
+	unsigned int next;
+
+	write_lock(&free_i->segmap_lock);
+	clear_bit(segno, free_i->free_segmap);
+	free_i->free_segments++;
+
+	next = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi), start_segno);
+	if (next >= start_segno + sbi->segs_per_sec) {
+		clear_bit(secno, free_i->free_secmap);
+		free_i->free_sections++;
+	}
+	write_unlock(&free_i->segmap_lock);
+}
+
+static inline void __set_inuse(struct f2fs_sb_info *sbi,
+		unsigned int segno)
+{
+	struct free_segmap_info *free_i = FREE_I(sbi);
+	unsigned int secno = segno / sbi->segs_per_sec;
+	set_bit(segno, free_i->free_segmap);
+	free_i->free_segments--;
+	if (!test_and_set_bit(secno, free_i->free_secmap))
+		free_i->free_sections--;
+}
+
+static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
+		unsigned int segno)
+{
+	struct free_segmap_info *free_i = FREE_I(sbi);
+	unsigned int secno = segno / sbi->segs_per_sec;
+	unsigned int start_segno = secno * sbi->segs_per_sec;
+	unsigned int next;
+
+	write_lock(&free_i->segmap_lock);
+	if (test_and_clear_bit(segno, free_i->free_segmap)) {
+		free_i->free_segments++;
+
+		next = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi),
+								start_segno);
+		if (next >= start_segno + sbi->segs_per_sec) {
+			if (test_and_clear_bit(secno, free_i->free_secmap))
+				free_i->free_sections++;
+		}
+	}
+	write_unlock(&free_i->segmap_lock);
+}
+
+static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi,
+		unsigned int segno)
+{
+	struct free_segmap_info *free_i = FREE_I(sbi);
+	unsigned int secno = segno / sbi->segs_per_sec;
+	write_lock(&free_i->segmap_lock);
+	if (!test_and_set_bit(segno, free_i->free_segmap)) {
+		free_i->free_segments--;
+		if (!test_and_set_bit(secno, free_i->free_secmap))
+			free_i->free_sections--;
+	}
+	write_unlock(&free_i->segmap_lock);
+}
+
+static inline void get_sit_bitmap(struct f2fs_sb_info *sbi,
+		void *dst_addr)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	memcpy(dst_addr, sit_i->sit_bitmap, sit_i->bitmap_size);
+}
+
+static inline block_t written_block_count(struct f2fs_sb_info *sbi)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	block_t vblocks;
+
+	mutex_lock(&sit_i->sentry_lock);
+	vblocks = sit_i->written_valid_blocks;
+	mutex_unlock(&sit_i->sentry_lock);
+
+	return vblocks;
+}
+
+static inline unsigned int free_segments(struct f2fs_sb_info *sbi)
+{
+	struct free_segmap_info *free_i = FREE_I(sbi);
+	unsigned int free_segs;
+
+	read_lock(&free_i->segmap_lock);
+	free_segs = free_i->free_segments;
+	read_unlock(&free_i->segmap_lock);
+
+	return free_segs;
+}
+
+static inline int reserved_segments(struct f2fs_sb_info *sbi)
+{
+	return SM_I(sbi)->reserved_segments;
+}
+
+static inline unsigned int free_sections(struct f2fs_sb_info *sbi)
+{
+	struct free_segmap_info *free_i = FREE_I(sbi);
+	unsigned int free_secs;
+
+	read_lock(&free_i->segmap_lock);
+	free_secs = free_i->free_sections;
+	read_unlock(&free_i->segmap_lock);
+
+	return free_secs;
+}
+
+static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
+{
+	return DIRTY_I(sbi)->nr_dirty[PRE];
+}
+
+static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi)
+{
+	return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] +
+		DIRTY_I(sbi)->nr_dirty[DIRTY_WARM_DATA] +
+		DIRTY_I(sbi)->nr_dirty[DIRTY_COLD_DATA] +
+		DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_NODE] +
+		DIRTY_I(sbi)->nr_dirty[DIRTY_WARM_NODE] +
+		DIRTY_I(sbi)->nr_dirty[DIRTY_COLD_NODE];
+}
+
+static inline int overprovision_segments(struct f2fs_sb_info *sbi)
+{
+	return SM_I(sbi)->ovp_segments;
+}
+
+static inline int overprovision_sections(struct f2fs_sb_info *sbi)
+{
+	return ((unsigned int) overprovision_segments(sbi)) / sbi->segs_per_sec;
+}
+
+static inline int reserved_sections(struct f2fs_sb_info *sbi)
+{
+	return ((unsigned int) reserved_segments(sbi)) / sbi->segs_per_sec;
+}
+
+static inline bool need_SSR(struct f2fs_sb_info *sbi)
+{
+	return (free_sections(sbi) < overprovision_sections(sbi));
+}
+
+static inline int get_ssr_segment(struct f2fs_sb_info *sbi, int type)
+{
+	struct curseg_info *curseg = CURSEG_I(sbi, type);
+	return DIRTY_I(sbi)->v_ops->get_victim(sbi,
+				&(curseg)->next_segno, BG_GC, type, SSR);
+}
+
+static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi)
+{
+	return free_sections(sbi) <= reserved_sections(sbi);
+}
+
+static inline int utilization(struct f2fs_sb_info *sbi)
+{
+	return (long int)valid_user_blocks(sbi) * 100 /
+			(long int)sbi->user_block_count;
+}
+
+/*
+ * Sometimes it is better for f2fs to drop the out-of-place update policy.
+ * So, if fs utilization is over MIN_IPU_UTIL, f2fs tries to write
+ * data in place, like other traditional file systems.
+ * But MIN_IPU_UTIL is currently set to 100 percent, which means
+ * in-place updates are disabled.
+ * See need_inplace_update() below.
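+ * Note that utilization() above never exceeds 100, so with the
+ * threshold at 100 the "utilization(sbi) > MIN_IPU_UTIL" test in
+ * need_inplace_update() is always false; a lower value (say 70) would
+ * enable in-place updates once the volume is over 70% full and SSR
+ * is active.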
+ */
+#define MIN_IPU_UTIL		100
+static inline bool need_inplace_update(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	if (S_ISDIR(inode->i_mode))
+		return false;
+	if (need_SSR(sbi) && utilization(sbi) > MIN_IPU_UTIL)
+		return true;
+	return false;
+}
+
+static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi,
+		int type)
+{
+	struct curseg_info *curseg = CURSEG_I(sbi, type);
+	return curseg->segno;
+}
+
+static inline unsigned char curseg_alloc_type(struct f2fs_sb_info *sbi,
+		int type)
+{
+	struct curseg_info *curseg = CURSEG_I(sbi, type);
+	return curseg->alloc_type;
+}
+
+static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type)
+{
+	struct curseg_info *curseg = CURSEG_I(sbi, type);
+	return curseg->next_blkoff;
+}
+
+static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+	unsigned int end_segno = SM_I(sbi)->segment_count - 1;
+	BUG_ON(segno > end_segno);
+}
+
+/*
+ * This function is used only for debugging.
+ * NOTE: In the future, we have to remove this function.
+ */
+static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
+{
+	struct f2fs_sm_info *sm_info = SM_I(sbi);
+	block_t total_blks = sm_info->segment_count << sbi->log_blocks_per_seg;
+	block_t start_addr = sm_info->seg0_blkaddr;
+	block_t end_addr = start_addr + total_blks - 1;
+	BUG_ON(blk_addr < start_addr);
+	BUG_ON(blk_addr > end_addr);
+}
+
+/*
+ * A summary block is always treated as an invalid block
+ */
+static inline void check_block_count(struct f2fs_sb_info *sbi,
+		int segno, struct f2fs_sit_entry *raw_sit)
+{
+	struct f2fs_sm_info *sm_info = SM_I(sbi);
+	unsigned int end_segno = sm_info->segment_count - 1;
+	int valid_blocks = 0;
+	int i;
+
+	/* check segment usage */
+	BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg);
+
+	/* check boundary of a given segment number */
+	BUG_ON(segno > end_segno);
+
+	/* check bitmap with valid block count */
+	for (i = 0; i < sbi->blocks_per_seg; i++)
+		if (f2fs_test_bit(i, raw_sit->valid_map))
+			valid_blocks++;
+	BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks);
+}
+
+static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi,
+						unsigned int start)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	unsigned int offset = SIT_BLOCK_OFFSET(sit_i, start);
+	block_t blk_addr = sit_i->sit_base_addr + offset;
+
+	check_seg_range(sbi, start);
+
+	/* calculate sit block address */
+	if (f2fs_test_bit(offset, sit_i->sit_bitmap))
+		blk_addr += sit_i->sit_blocks;
+
+	return blk_addr;
+}
+
+static inline pgoff_t next_sit_addr(struct f2fs_sb_info *sbi,
+						pgoff_t block_addr)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	block_addr -= sit_i->sit_base_addr;
+	if (block_addr < sit_i->sit_blocks)
+		block_addr += sit_i->sit_blocks;
+	else
+		block_addr -= sit_i->sit_blocks;
+
+	return block_addr + sit_i->sit_base_addr;
+}
+
+static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start)
+{
+	unsigned int block_off = SIT_BLOCK_OFFSET(sit_i, start);
+
+	if (f2fs_test_bit(block_off, sit_i->sit_bitmap))
+		f2fs_clear_bit(block_off, sit_i->sit_bitmap);
+	else
+		f2fs_set_bit(block_off, sit_i->sit_bitmap);
+}
+
+static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	return sit_i->elapsed_time + CURRENT_TIME_SEC.tv_sec -
+						sit_i->mounted_time;
+}
+
+static inline void set_summary(struct f2fs_summary *sum, nid_t nid,
+			unsigned int ofs_in_node, unsigned char version)
+{
+	sum->nid = cpu_to_le32(nid);
+	sum->ofs_in_node = cpu_to_le16(ofs_in_node);
+	sum->version = version;
+}
+
+static inline block_t start_sum_block(struct f2fs_sb_info *sbi)
+{
+	return __start_cp_addr(sbi) +
+		le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum);
+}
+
+static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type)
+{
+	return __start_cp_addr(sbi) +
+		le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_total_block_count)
+				- (base + 1) + type;
+}
--
cgit v1.2.1


From aff063e266cbf4754021d8e5d16ee418560906fd Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim
Date: Fri, 2 Nov 2012 17:07:47 +0900
Subject: f2fs: add super block operations

This adds the implementation of superblock operations for f2fs, which
includes
 - init_f2fs_fs/exit_f2fs_fs
 - f2fs_mount
 - super_operations of f2fs

Signed-off-by: Jaegeuk Kim
---
 fs/f2fs/super.c | 656 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 656 insertions(+)
 create mode 100644 fs/f2fs/super.c
(limited to 'fs')

diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
new file mode 100644
index 000000000000..8661c93538af
--- /dev/null
+++ b/fs/f2fs/super.c
@@ -0,0 +1,656 @@
+/**
+ * fs/f2fs/super.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "f2fs.h"
+#include "node.h"
+#include "xattr.h"
+
+static struct kmem_cache *f2fs_inode_cachep;
+
+enum {
+	Opt_gc_background_off,
+	Opt_disable_roll_forward,
+	Opt_discard,
+	Opt_noheap,
+	Opt_nouser_xattr,
+	Opt_noacl,
+	Opt_active_logs,
+	Opt_disable_ext_identify,
+	Opt_err,
+};
+
+static match_table_t f2fs_tokens = {
+	{Opt_gc_background_off, "background_gc_off"},
+	{Opt_disable_roll_forward, "disable_roll_forward"},
+	{Opt_discard, "discard"},
+	{Opt_noheap, "no_heap"},
+	{Opt_nouser_xattr, "nouser_xattr"},
+	{Opt_noacl, "noacl"},
+	{Opt_active_logs, "active_logs=%u"},
+	{Opt_disable_ext_identify, "disable_ext_identify"},
+	{Opt_err, NULL},
+};
+
+static void init_once(void *foo)
+{
+	struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo;
+
+	memset(fi, 0, sizeof(*fi));
+	inode_init_once(&fi->vfs_inode);
+}
+
+static struct inode *f2fs_alloc_inode(struct super_block *sb)
+{
+	struct f2fs_inode_info *fi;
+
+	fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_NOFS | __GFP_ZERO);
+	if (!fi)
+		return NULL;
+
+	init_once((void *) fi);
+
+	/* Initialize f2fs-specific inode info */
+	fi->vfs_inode.i_version = 1;
+	atomic_set(&fi->dirty_dents, 0);
+	fi->i_current_depth = 1;
+	fi->i_advise = 0;
+	rwlock_init(&fi->ext.ext_lock);
+
+	set_inode_flag(fi, FI_NEW_INODE);
+
+	return &fi->vfs_inode;
+}
+
+static void f2fs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(f2fs_inode_cachep, F2FS_I(inode));
+}
+
+void f2fs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, f2fs_i_callback);
+}
+
+static void f2fs_put_super(struct super_block *sb)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+
+	f2fs_destroy_stats(sbi);
+	stop_gc_thread(sbi);
+
+	write_checkpoint(sbi, false, true);
+
+	iput(sbi->node_inode);
+	iput(sbi->meta_inode);
+
+	/* destroy f2fs internal modules */
+	destroy_node_manager(sbi);
+	destroy_segment_manager(sbi);
+
+	kfree(sbi->ckpt);
+
+	sb->s_fs_info = NULL;
+	brelse(sbi->raw_super_buf);
+	kfree(sbi);
+}
+
+int f2fs_sync_fs(struct super_block *sb, int sync)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+	int ret = 0;
+
+	if (!sbi->s_dirty && !get_pages(sbi, F2FS_DIRTY_NODES))
+		return 0;
+
+	if (sync)
+		write_checkpoint(sbi, false, false);
+
+	return ret;
+}
+
+static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+	block_t total_count, user_block_count, start_count, ovp_count;
+
+	total_count = le64_to_cpu(sbi->raw_super->block_count);
+	user_block_count = sbi->user_block_count;
+	start_count = le32_to_cpu(sbi->raw_super->segment0_blkaddr);
+	ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg;
+	buf->f_type = F2FS_SUPER_MAGIC;
+	buf->f_bsize = sbi->blocksize;
+
+	buf->f_blocks = total_count - start_count;
+	buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count;
+	buf->f_bavail = user_block_count - valid_user_blocks(sbi);
+
+	buf->f_files = valid_inode_count(sbi);
+	buf->f_ffree = sbi->total_node_count - valid_node_count(sbi);
+
+	buf->f_namelen = F2FS_MAX_NAME_LEN;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+
+	return 0;
+}
+
+static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb);
+
+	if (test_opt(sbi, BG_GC))
+		seq_puts(seq, ",background_gc_on");
+	else
+		seq_puts(seq, ",background_gc_off");
+	if (test_opt(sbi, DISABLE_ROLL_FORWARD))
+		seq_puts(seq, ",disable_roll_forward");
+	if (test_opt(sbi, DISCARD))
+		seq_puts(seq, ",discard");
+	if (test_opt(sbi, NOHEAP))
+		seq_puts(seq, ",no_heap_alloc");
+#ifdef CONFIG_F2FS_FS_XATTR
+	if (test_opt(sbi, XATTR_USER))
+		seq_puts(seq, ",user_xattr");
+	else
+		seq_puts(seq, ",nouser_xattr");
+#endif
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+	if (test_opt(sbi, POSIX_ACL))
+		seq_puts(seq, ",acl");
+	else
+		seq_puts(seq, ",noacl");
+#endif
+	if (test_opt(sbi, DISABLE_EXT_IDENTIFY))
+		seq_puts(seq, ",disable_ext_identify");
+
+	seq_printf(seq, ",active_logs=%u", sbi->active_logs);
+
+	return 0;
+}
+
+static struct super_operations f2fs_sops = {
+	.alloc_inode	= f2fs_alloc_inode,
+	.destroy_inode	= f2fs_destroy_inode,
+	.write_inode	= f2fs_write_inode,
+	.show_options	= f2fs_show_options,
+	.evict_inode	= f2fs_evict_inode,
+	.put_super	= f2fs_put_super,
+	.sync_fs	= f2fs_sync_fs,
+	.statfs		= f2fs_statfs,
+};
+
+static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
+		u64 ino, u32 generation)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+	struct inode *inode;
+
+	if (ino < F2FS_ROOT_INO(sbi))
+		return ERR_PTR(-ESTALE);
+
+	/*
+	 * f2fs_iget isn't quite right if the inode is currently unallocated!
+	 * However f2fs_iget currently does appropriate checks to handle stale
+	 * inodes so everything is OK.
+	 */
+	inode = f2fs_iget(sb, ino);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+	if (generation && inode->i_generation != generation) {
+		/* we didn't find the right inode.. */
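+		/*
+		 * e.g. an NFS file handle minted before this inode
+		 * number was recycled still carries the old
+		 * i_generation; -ESTALE stops the handle from
+		 * resolving to the new file.
+		 */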
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
+	return inode;
+}
+
+static struct dentry *f2fs_fh_to_dentry(struct super_block *sb, struct fid *fid,
+		int fh_len, int fh_type)
+{
+	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+				    f2fs_nfs_get_inode);
+}
+
+static struct dentry *f2fs_fh_to_parent(struct super_block *sb, struct fid *fid,
+		int fh_len, int fh_type)
+{
+	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+				    f2fs_nfs_get_inode);
+}
+
+static const struct export_operations f2fs_export_ops = {
+	.fh_to_dentry = f2fs_fh_to_dentry,
+	.fh_to_parent = f2fs_fh_to_parent,
+	.get_parent = f2fs_get_parent,
+};
+
+static int parse_options(struct f2fs_sb_info *sbi, char *options)
+{
+	substring_t args[MAX_OPT_ARGS];
+	char *p;
+	int arg = 0;
+
+	if (!options)
+		return 0;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+		/*
+		 * Initialize args struct so we know whether arg was
+		 * found; some options take optional arguments.
+		 */
+		args[0].to = args[0].from = NULL;
+		token = match_token(p, f2fs_tokens, args);
+
+		switch (token) {
+		case Opt_gc_background_off:
+			clear_opt(sbi, BG_GC);
+			break;
+		case Opt_disable_roll_forward:
+			set_opt(sbi, DISABLE_ROLL_FORWARD);
+			break;
+		case Opt_discard:
+			set_opt(sbi, DISCARD);
+			break;
+		case Opt_noheap:
+			set_opt(sbi, NOHEAP);
+			break;
+#ifdef CONFIG_F2FS_FS_XATTR
+		case Opt_nouser_xattr:
+			clear_opt(sbi, XATTR_USER);
+			break;
+#else
+		case Opt_nouser_xattr:
+			pr_info("nouser_xattr option is not supported\n");
+			break;
+#endif
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+		case Opt_noacl:
+			clear_opt(sbi, POSIX_ACL);
+			break;
+#else
+		case Opt_noacl:
+			pr_info("noacl option is not supported\n");
+			break;
+#endif
+		case Opt_active_logs:
+			if (args->from && match_int(args, &arg))
+				return -EINVAL;
+			if (arg != 2 && arg != 4 && arg != 6)
+				return -EINVAL;
+			sbi->active_logs = arg;
+			break;
+		case Opt_disable_ext_identify:
+			set_opt(sbi, DISABLE_EXT_IDENTIFY);
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+static loff_t max_file_size(unsigned bits)
+{
+	loff_t result = ADDRS_PER_INODE;
+	loff_t leaf_count = ADDRS_PER_BLOCK;
+
+	/* two direct node blocks */
+	result += (leaf_count * 2);
+
+	/* two indirect node blocks */
+	leaf_count *= NIDS_PER_BLOCK;
+	result += (leaf_count * 2);
+
+	/* one double indirect node block */
+	leaf_count *= NIDS_PER_BLOCK;
+	result += leaf_count;
+
+	result <<= bits;
+	return result;
+}
+
+static int sanity_check_raw_super(struct f2fs_super_block *raw_super)
+{
+	unsigned int blocksize;
+
+	if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic))
+		return 1;
+
+	/* Currently, support only 4KB block size */
+	blocksize = 1 << le32_to_cpu(raw_super->log_blocksize);
+	if (blocksize != PAGE_CACHE_SIZE)
+		return 1;
+	if (le32_to_cpu(raw_super->log_sectorsize) !=
+					F2FS_LOG_SECTOR_SIZE)
+		return 1;
+	if (le32_to_cpu(raw_super->log_sectors_per_block) !=
+					F2FS_LOG_SECTORS_PER_BLOCK)
+		return 1;
+	return 0;
+}
+
+static int sanity_check_ckpt(struct f2fs_super_block *raw_super,
+				struct f2fs_checkpoint *ckpt)
+{
+	unsigned int total, fsmeta;
+
+	total = le32_to_cpu(raw_super->segment_count);
+	fsmeta = le32_to_cpu(raw_super->segment_count_ckpt);
+	fsmeta += le32_to_cpu(raw_super->segment_count_sit);
+	fsmeta += le32_to_cpu(raw_super->segment_count_nat);
+	fsmeta += le32_to_cpu(ckpt->rsvd_segment_count);
+	fsmeta += le32_to_cpu(raw_super->segment_count_ssa);
+
+	if (fsmeta >= total)
+		return 1;
+	return 0;
+}
+
+static void init_sb_info(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_super_block *raw_super = sbi->raw_super;
+	int i;
+
+	sbi->log_sectors_per_block =
+		le32_to_cpu(raw_super->log_sectors_per_block);
+	sbi->log_blocksize = le32_to_cpu(raw_super->log_blocksize);
+	sbi->blocksize = 1 << sbi->log_blocksize;
+	sbi->log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg);
+	sbi->blocks_per_seg = 1 << sbi->log_blocks_per_seg;
+	sbi->segs_per_sec = le32_to_cpu(raw_super->segs_per_sec);
+	sbi->secs_per_zone = le32_to_cpu(raw_super->secs_per_zone);
+	sbi->total_sections = le32_to_cpu(raw_super->section_count);
+	sbi->total_node_count =
+		(le32_to_cpu(raw_super->segment_count_nat) / 2)
+			* sbi->blocks_per_seg * NAT_ENTRY_PER_BLOCK;
+	sbi->root_ino_num = le32_to_cpu(raw_super->root_ino);
+	sbi->node_ino_num = le32_to_cpu(raw_super->node_ino);
+	sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino);
+
+	for (i = 0; i < NR_COUNT_TYPE; i++)
+		atomic_set(&sbi->nr_pages[i], 0);
+}
+
+static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct f2fs_sb_info *sbi;
+	struct f2fs_super_block *raw_super;
+	struct buffer_head *raw_super_buf;
+	struct inode *root;
+	long err = -EINVAL;
+	int i;
+
+	/* allocate memory for f2fs-specific super block info */
+	sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+
+	/* set a temporary block size */
+	if (!sb_set_blocksize(sb, F2FS_BLKSIZE))
+		goto free_sbi;
+
+	/* read f2fs raw super block */
+	raw_super_buf = sb_bread(sb, 0);
+	if (!raw_super_buf) {
+		err = -EIO;
+		goto free_sbi;
+	}
+	raw_super = (struct f2fs_super_block *)
+			((char *)raw_super_buf->b_data + F2FS_SUPER_OFFSET);
+
+	/* init some FS parameters */
+	sbi->active_logs = NR_CURSEG_TYPE;
+
+	set_opt(sbi, BG_GC);
+
+#ifdef CONFIG_F2FS_FS_XATTR
+	set_opt(sbi, XATTR_USER);
+#endif
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+	set_opt(sbi, POSIX_ACL);
+#endif
+	/* parse mount options */
+	if (parse_options(sbi, (char *)data))
+		goto free_sb_buf;
+
+	/* sanity checking of raw super */
+	if (sanity_check_raw_super(raw_super))
+		goto free_sb_buf;
+
+	sb->s_maxbytes = max_file_size(raw_super->log_blocksize);
+	sb->s_max_links = F2FS_LINK_MAX;
+	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
+
+	sb->s_op = &f2fs_sops;
+	sb->s_xattr = f2fs_xattr_handlers;
+	sb->s_export_op = &f2fs_export_ops;
+	sb->s_magic = F2FS_SUPER_MAGIC;
+	sb->s_fs_info = sbi;
+	sb->s_time_gran = 1;
+	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+		(test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
+	memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid));
+
+	/* init f2fs-specific super block info */
+	sbi->sb = sb;
+	sbi->raw_super = raw_super;
+	sbi->raw_super_buf = raw_super_buf;
+	mutex_init(&sbi->gc_mutex);
+	mutex_init(&sbi->write_inode);
+	mutex_init(&sbi->writepages);
+	mutex_init(&sbi->cp_mutex);
+	for (i = 0; i < NR_LOCK_TYPE; i++)
+		mutex_init(&sbi->fs_lock[i]);
+	sbi->por_doing = 0;
+	spin_lock_init(&sbi->stat_lock);
+	init_rwsem(&sbi->bio_sem);
+	init_sb_info(sbi);
+
+	/* get an inode for meta space */
+	sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi));
+	if (IS_ERR(sbi->meta_inode)) {
+		err = PTR_ERR(sbi->meta_inode);
+		goto free_sb_buf;
+	}
+
+	err = get_valid_checkpoint(sbi);
+	if (err)
+		goto free_meta_inode;
+
+	/* sanity checking of checkpoint */
+	err = -EINVAL;
+	if (sanity_check_ckpt(raw_super, sbi->ckpt))
+		goto free_cp;
+
+	sbi->total_valid_node_count =
+				le32_to_cpu(sbi->ckpt->valid_node_count);
+	sbi->total_valid_inode_count =
+				le32_to_cpu(sbi->ckpt->valid_inode_count);
+	sbi->user_block_count = le64_to_cpu(sbi->ckpt->user_block_count);
+	sbi->total_valid_block_count =
+				le64_to_cpu(sbi->ckpt->valid_block_count);
+	sbi->last_valid_block_count = sbi->total_valid_block_count;
+	sbi->alloc_valid_block_count = 0;
+	INIT_LIST_HEAD(&sbi->dir_inode_list);
+	spin_lock_init(&sbi->dir_inode_lock);
+
+	/* init super block */
+	if (!sb_set_blocksize(sb, sbi->blocksize))
+		goto free_cp;
+
+	init_orphan_info(sbi);
+
+	/* setup f2fs internal modules */
+	err = build_segment_manager(sbi);
+	if (err)
+		goto free_sm;
+	err = build_node_manager(sbi);
+	if (err)
+		goto free_nm;
+
+	build_gc_manager(sbi);
+
+	/* get an inode for node space */
+	sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi));
+	if (IS_ERR(sbi->node_inode)) {
+		err = PTR_ERR(sbi->node_inode);
+		goto free_nm;
+	}
+
+	/* if there are any orphan nodes, free them */
+	err = -EINVAL;
+	if (!(sbi->ckpt->ckpt_flags & CP_UMOUNT_FLAG) &&
+				recover_orphan_inodes(sbi))
+		goto free_node_inode;
+
+	/* read root inode and dentry */
+	root = f2fs_iget(sb, F2FS_ROOT_INO(sbi));
+	if (IS_ERR(root)) {
+		err = PTR_ERR(root);
+		goto free_node_inode;
+	}
+	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size)
+		goto free_root_inode;
+
+	sb->s_root = d_make_root(root); /* allocate root dentry */
+	if (!sb->s_root) {
+		err = -ENOMEM;
+		goto free_root_inode;
+	}
+
+	/* recover fsynced data */
+	if (!(sbi->ckpt->ckpt_flags & CP_UMOUNT_FLAG) &&
+				!test_opt(sbi, DISABLE_ROLL_FORWARD))
+		recover_fsync_data(sbi);
+
+	/* After POR, we can run background GC thread */
+	err = start_gc_thread(sbi);
+	if (err)
+		goto fail;
+
+	err = f2fs_build_stats(sbi);
+	if (err)
+		goto fail;
+
+	return 0;
+fail:
+	stop_gc_thread(sbi);
+free_root_inode:
+	dput(sb->s_root);
+	sb->s_root = NULL;
+free_node_inode:
+	iput(sbi->node_inode);
+free_nm:
+	destroy_node_manager(sbi);
+free_sm:
+	destroy_segment_manager(sbi);
+free_cp:
+	kfree(sbi->ckpt);
+free_meta_inode:
+	make_bad_inode(sbi->meta_inode);
+	iput(sbi->meta_inode);
+free_sb_buf:
+	brelse(raw_super_buf);
+free_sbi:
+	kfree(sbi);
+	return err;
+}
+
+static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags,
+			const char *dev_name, void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, f2fs_fill_super);
+}
+
+static struct file_system_type f2fs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "f2fs",
+	.mount		= f2fs_mount,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+
+static int init_inodecache(void)
+{
+	f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache",
+			sizeof(struct f2fs_inode_info), NULL);
+	if (f2fs_inode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static void destroy_inodecache(void)
+{
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(f2fs_inode_cachep);
+}
+
+static int __init init_f2fs_fs(void)
+{
+	int err;
+
+	err = init_inodecache();
+	if (err)
+		goto fail;
+	err = create_node_manager_caches();
+	if (err)
+		goto fail;
+	err = create_gc_caches();
+	if (err)
+		goto fail;
+	err = create_checkpoint_caches();
+	if (err)
+		goto fail;
+	return register_filesystem(&f2fs_fs_type);
+fail:
+	return err;
+}
+
+static void __exit exit_f2fs_fs(void)
+{
+	destroy_root_stats();
+	unregister_filesystem(&f2fs_fs_type);
+	destroy_checkpoint_caches();
+	destroy_gc_caches();
+	destroy_node_manager_caches();
+	destroy_inodecache();
+}
+
+module_init(init_f2fs_fs)
+module_exit(exit_f2fs_fs)
+
+MODULE_AUTHOR("Samsung Electronics's Praesto Team");
+MODULE_DESCRIPTION("Flash Friendly File System");
+MODULE_LICENSE("GPL");
--
cgit v1.2.1


From 127e670abfa7fa150f6550d620ded930f5bdb4e7 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim
Date: Fri, 2 Nov 2012 17:08:18 +0900
Subject: f2fs: add checkpoint operations

This adds functions required by the checkpoint operations.

Basically, f2fs adopts a roll-back model with checkpoint blocks written in
the CP area.

The checkpoint procedure is as follows.

- write_checkpoint()
 1. block_operations() freezes VFS calls.
 2. submit cached bios.
 3. flush_nat_entries() writes NAT pages updated by dirty NAT entries.
 4. flush_sit_entries() writes SIT pages updated by dirty SIT entries.
 5. do_checkpoint() writes,
   - checkpoint block (#0)
   - orphan inode blocks
   - summary blocks made by active logs
   - checkpoint block (copy of #0)
 6. unblock_operations()

In order to provide an address space for meta pages, f2fs_sb_info has a
special inode, namely meta_inode. This patch also adds the address space
operations for meta_inode.

Signed-off-by: Chul Lee
Signed-off-by: Jaegeuk Kim
---
 fs/f2fs/checkpoint.c | 792 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 792 insertions(+)
 create mode 100644 fs/f2fs/checkpoint.c
(limited to 'fs')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
new file mode 100644
index 000000000000..ab743f92ee06
--- /dev/null
+++ b/fs/f2fs/checkpoint.c
@@ -0,0 +1,792 @@
+/**
+ * fs/f2fs/checkpoint.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+
+static struct kmem_cache *orphan_entry_slab;
+static struct kmem_cache *inode_entry_slab;
+
+/**
+ * We guarantee no failure on the returned page.
+ */
+struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
+{
+	struct address_space *mapping = sbi->meta_inode->i_mapping;
+	struct page *page = NULL;
+repeat:
+	page = grab_cache_page(mapping, index);
+	if (!page) {
+		cond_resched();
+		goto repeat;
+	}
+
+	/* We wait writeback only inside grab_meta_page() */
+	wait_on_page_writeback(page);
+	SetPageUptodate(page);
+	return page;
+}
+
+/**
+ * We guarantee no failure on the returned page.
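+ * Both failure modes below are simply retried until they succeed: a
+ * failed page allocation backs off via cond_resched(), and a failed
+ * read is re-issued, so callers may use the returned page without
+ * IS_ERR()/NULL checks.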
+ */
+struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
+{
+	struct address_space *mapping = sbi->meta_inode->i_mapping;
+	struct page *page;
+repeat:
+	page = grab_cache_page(mapping, index);
+	if (!page) {
+		cond_resched();
+		goto repeat;
+	}
+	if (f2fs_readpage(sbi, page, index, READ_SYNC)) {
+		f2fs_put_page(page, 1);
+		goto repeat;
+	}
+	mark_page_accessed(page);
+
+	/* We do not allow returning an erroneous page */
+	return page;
+}
+
+static int f2fs_write_meta_page(struct page *page,
+				struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	int err;
+
+	wait_on_page_writeback(page);
+
+	err = write_meta_page(sbi, page, wbc);
+	if (err) {
+		wbc->pages_skipped++;
+		set_page_dirty(page);
+	}
+
+	dec_page_count(sbi, F2FS_DIRTY_META);
+
+	/* In this case, we should not unlock this page */
+	if (err != AOP_WRITEPAGE_ACTIVATE)
+		unlock_page(page);
+	return err;
+}
+
+static int f2fs_write_meta_pages(struct address_space *mapping,
+				struct writeback_control *wbc)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
+	struct block_device *bdev = sbi->sb->s_bdev;
+	long written;
+
+	if (wbc->for_kupdate)
+		return 0;
+
+	if (get_pages(sbi, F2FS_DIRTY_META) == 0)
+		return 0;
+
+	/* if mounting is failed, skip writing node pages */
+	mutex_lock(&sbi->cp_mutex);
+	written = sync_meta_pages(sbi, META, bio_get_nr_vecs(bdev));
+	mutex_unlock(&sbi->cp_mutex);
+	wbc->nr_to_write -= written;
+	return 0;
+}
+
+long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
+				long nr_to_write)
+{
+	struct address_space *mapping = sbi->meta_inode->i_mapping;
+	pgoff_t index = 0, end = LONG_MAX;
+	struct pagevec pvec;
+	long nwritten = 0;
+	struct writeback_control wbc = {
+		.for_reclaim = 0,
+	};
+
+	pagevec_init(&pvec, 0);
+
+	while (index <= end) {
+		int i, nr_pages;
+		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+				PAGECACHE_TAG_DIRTY,
+				min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+		if (nr_pages == 0)
+			break;
+
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+			lock_page(page);
+			BUG_ON(page->mapping != mapping);
+			BUG_ON(!PageDirty(page));
+			clear_page_dirty_for_io(page);
+			f2fs_write_meta_page(page, &wbc);
+			if (nwritten++ >= nr_to_write)
+				break;
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+
+	if (nwritten)
+		f2fs_submit_bio(sbi, type, nr_to_write == LONG_MAX);
+
+	return nwritten;
+}
+
+static int f2fs_set_meta_page_dirty(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
+
+	SetPageUptodate(page);
+	if (!PageDirty(page)) {
+		__set_page_dirty_nobuffers(page);
+		inc_page_count(sbi, F2FS_DIRTY_META);
+		F2FS_SET_SB_DIRT(sbi);
+		return 1;
+	}
+	return 0;
+}
+
+const struct address_space_operations f2fs_meta_aops = {
+	.writepage	= f2fs_write_meta_page,
+	.writepages	= f2fs_write_meta_pages,
+	.set_page_dirty	= f2fs_set_meta_page_dirty,
+};
+
+int check_orphan_space(struct f2fs_sb_info *sbi)
+{
+	unsigned int max_orphans;
+	int err = 0;
+
+	/*
+	 * considering 512 blocks in a segment, 5 blocks are needed for cp
+	 * and log segment summaries. Remaining blocks are used to keep
+	 * orphan entries. With one reserved segment for the cp pack we can
+	 * have at most 1020 * 507 orphan entries.
+	 */
+	max_orphans = (sbi->blocks_per_seg - 5) * F2FS_ORPHANS_PER_BLOCK;
+	mutex_lock(&sbi->orphan_inode_mutex);
+	if (sbi->n_orphans >= max_orphans)
+		err = -ENOSPC;
+	mutex_unlock(&sbi->orphan_inode_mutex);
+	return err;
+}
+
+void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
+{
+	struct list_head *head, *this;
+	struct orphan_inode_entry *new = NULL, *orphan = NULL;
+
+	mutex_lock(&sbi->orphan_inode_mutex);
+	head = &sbi->orphan_inode_list;
+	list_for_each(this, head) {
+		orphan = list_entry(this, struct orphan_inode_entry, list);
+		if (orphan->ino == ino)
+			goto out;
+		if (orphan->ino > ino)
+			break;
+		orphan = NULL;
+	}
+retry:
+	new = kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC);
+	if (!new) {
+		cond_resched();
+		goto retry;
+	}
+	new->ino = ino;
+	INIT_LIST_HEAD(&new->list);
+
+	/* add the new entry into the list sorted by inode number */
+	if (orphan) {
+		struct orphan_inode_entry *prev;
+
+		/* get previous entry */
+		prev = list_entry(orphan->list.prev, typeof(*prev), list);
+		if (&prev->list != head)
+			/* insert new orphan inode entry */
+			list_add(&new->list, &prev->list);
+		else
+			list_add(&new->list, head);
+	} else {
+		list_add_tail(&new->list, head);
+	}
+	sbi->n_orphans++;
+out:
+	mutex_unlock(&sbi->orphan_inode_mutex);
+}
+
+void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
+{
+	struct list_head *this, *next, *head;
+	struct orphan_inode_entry *orphan;
+
+	mutex_lock(&sbi->orphan_inode_mutex);
+	head = &sbi->orphan_inode_list;
+	list_for_each_safe(this, next, head) {
+		orphan = list_entry(this, struct orphan_inode_entry, list);
+		if (orphan->ino == ino) {
+			list_del(&orphan->list);
+			kmem_cache_free(orphan_entry_slab, orphan);
+			sbi->n_orphans--;
+			break;
+		}
+	}
+	mutex_unlock(&sbi->orphan_inode_mutex);
+}
+
+static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
+{
+	struct inode *inode = f2fs_iget(sbi->sb, ino);
+	BUG_ON(IS_ERR(inode));
+	clear_nlink(inode);
+
+	/* truncate all the data during iput */
+	iput(inode);
+}
+
+int recover_orphan_inodes(struct f2fs_sb_info *sbi)
+{
+	block_t start_blk, orphan_blkaddr, i, j;
+
+	if (!(F2FS_CKPT(sbi)->ckpt_flags & CP_ORPHAN_PRESENT_FLAG))
+		return 0;
+
+	sbi->por_doing = 1;
+	start_blk = __start_cp_addr(sbi) + 1;
+	orphan_blkaddr = __start_sum_addr(sbi) - 1;
+
+	for (i = 0; i < orphan_blkaddr; i++) {
+		struct page *page = get_meta_page(sbi, start_blk + i);
+		struct f2fs_orphan_block *orphan_blk;
+
+		orphan_blk = (struct f2fs_orphan_block *)page_address(page);
+		for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) {
+			nid_t ino = le32_to_cpu(orphan_blk->ino[j]);
+			recover_orphan_inode(sbi, ino);
+		}
+		f2fs_put_page(page, 1);
+	}
+	/* clear Orphan Flag */
+	F2FS_CKPT(sbi)->ckpt_flags &= (~CP_ORPHAN_PRESENT_FLAG);
+	sbi->por_doing = 0;
+	return 0;
+}
+
+static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
+{
+	struct list_head *head, *this, *next;
+	struct f2fs_orphan_block *orphan_blk = NULL;
+	struct page *page = NULL;
+	unsigned int nentries = 0;
+	unsigned short index = 1;
+	unsigned short orphan_blocks;
+
+	orphan_blocks = (unsigned short)((sbi->n_orphans +
+		(F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK);
+
+	mutex_lock(&sbi->orphan_inode_mutex);
+	head = &sbi->orphan_inode_list;
+
+	/* loop for each orphan inode entry and write them in journal block */
+	list_for_each_safe(this, next, head) {
+		struct orphan_inode_entry *orphan;
+
+		orphan = list_entry(this, struct orphan_inode_entry, list);
+
+		if (nentries == F2FS_ORPHANS_PER_BLOCK) {
+			/*
+			 * If an orphan block is full of 1020 entries,
+			 * then we need to flush the current orphan block
+			 * and bring another one in memory
+			 */
+			orphan_blk->blk_addr = cpu_to_le16(index);
+			orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
+			orphan_blk->entry_count = cpu_to_le32(nentries);
+			set_page_dirty(page);
+			f2fs_put_page(page, 1);
+			index++;
+			start_blk++;
+			nentries = 0;
+			page = NULL;
+		}
+		if (page)
+			goto page_exist;
+
+		page = grab_meta_page(sbi, start_blk);
+		orphan_blk = (struct f2fs_orphan_block *)page_address(page);
+		memset(orphan_blk, 0, sizeof(*orphan_blk));
+page_exist:
+		orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino);
+	}
+	if (!page)
+		goto end;
+
+	orphan_blk->blk_addr = cpu_to_le16(index);
+	orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
+	orphan_blk->entry_count = cpu_to_le32(nentries);
+	set_page_dirty(page);
+	f2fs_put_page(page, 1);
+end:
+	mutex_unlock(&sbi->orphan_inode_mutex);
+}
+
+static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
+				block_t cp_addr, unsigned long long *version)
+{
+	struct page *cp_page_1, *cp_page_2 = NULL;
+	unsigned long blk_size = sbi->blocksize;
+	struct f2fs_checkpoint *cp_block;
+	unsigned long long cur_version = 0, pre_version = 0;
+	unsigned int crc = 0;
+	size_t crc_offset;
+
+	/* Read the 1st cp block in this CP pack */
+	cp_page_1 = get_meta_page(sbi, cp_addr);
+
+	/* get the version number */
+	cp_block = (struct f2fs_checkpoint *)page_address(cp_page_1);
+	crc_offset = le32_to_cpu(cp_block->checksum_offset);
+	if (crc_offset >= blk_size)
+		goto invalid_cp1;
+
+	crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset);
+	if (!f2fs_crc_valid(crc, cp_block, crc_offset))
+		goto invalid_cp1;
+
+	pre_version = le64_to_cpu(cp_block->checkpoint_ver);
+
+	/* Read the 2nd cp block in this CP pack */
+	cp_addr += le64_to_cpu(cp_block->cp_pack_total_block_count) - 1;
+	cp_page_2 = get_meta_page(sbi, cp_addr);
+
+	cp_block = (struct f2fs_checkpoint *)page_address(cp_page_2);
+	crc_offset = le32_to_cpu(cp_block->checksum_offset);
+	if (crc_offset >= blk_size)
+		goto invalid_cp2;
+
+	crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset);
+	if (!f2fs_crc_valid(crc, cp_block, crc_offset))
+		goto invalid_cp2;
+
+	cur_version = le64_to_cpu(cp_block->checkpoint_ver);
+
+	if (cur_version == pre_version) {
+		*version = cur_version;
+		f2fs_put_page(cp_page_2, 1);
+		return cp_page_1;
+	}
+invalid_cp2:
+	f2fs_put_page(cp_page_2, 1);
+invalid_cp1:
+	f2fs_put_page(cp_page_1, 1);
+	return NULL;
+}
+
+int get_valid_checkpoint(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_checkpoint *cp_block;
+	struct f2fs_super_block *fsb = sbi->raw_super;
+	struct page *cp1, *cp2, *cur_page;
+	unsigned long blk_size = sbi->blocksize;
+	unsigned long long cp1_version = 0, cp2_version = 0;
+	unsigned long long cp_start_blk_no;
+
+	sbi->ckpt = kzalloc(blk_size, GFP_KERNEL);
+	if (!sbi->ckpt)
+		return -ENOMEM;
+	/*
+	 * Finding a valid cp block involves reading both
+	 * sets (cp pack 1 and cp pack 2)
+	 */
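+	/*
+	 * The two packs are written alternately. A pack counts as valid
+	 * only if its trailing cp block matches its head (same version,
+	 * good CRC), so a torn write of one pack can never be chosen;
+	 * the code below then picks the pack carrying the newer version.
+	 */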
+	cp_start_blk_no = le32_to_cpu(fsb->cp_blkaddr);
+	cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version);
+
+	/* The second checkpoint pack should start at the next segment */
+	cp_start_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg);
+	cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version);
+
+	if (cp1 && cp2) {
+		if (ver_after(cp2_version, cp1_version))
+			cur_page = cp2;
+		else
+			cur_page = cp1;
+	} else if (cp1) {
+		cur_page = cp1;
+	} else if (cp2) {
+		cur_page = cp2;
+	} else {
+		goto fail_no_cp;
+	}
+
+	cp_block = (struct f2fs_checkpoint *)page_address(cur_page);
+	memcpy(sbi->ckpt, cp_block, blk_size);
+
+	f2fs_put_page(cp1, 1);
+	f2fs_put_page(cp2, 1);
+	return 0;
+
+fail_no_cp:
+	kfree(sbi->ckpt);
+	return -EINVAL;
+}
+
+void set_dirty_dir_page(struct inode *inode, struct page *page)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct list_head *head = &sbi->dir_inode_list;
+	struct dir_inode_entry *new;
+	struct list_head *this;
+
+	if (!S_ISDIR(inode->i_mode))
+		return;
+retry:
+	new = kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
+	if (!new) {
+		cond_resched();
+		goto retry;
+	}
+	new->inode = inode;
+	INIT_LIST_HEAD(&new->list);
+
+	spin_lock(&sbi->dir_inode_lock);
+	list_for_each(this, head) {
+		struct dir_inode_entry *entry;
+		entry = list_entry(this, struct dir_inode_entry, list);
+		if (entry->inode == inode) {
+			kmem_cache_free(inode_entry_slab, new);
+			goto out;
+		}
+	}
+	list_add_tail(&new->list, head);
+	sbi->n_dirty_dirs++;
+
+	BUG_ON(!S_ISDIR(inode->i_mode));
+out:
+	inc_page_count(sbi, F2FS_DIRTY_DENTS);
+	inode_inc_dirty_dents(inode);
+	SetPagePrivate(page);
+
+	spin_unlock(&sbi->dir_inode_lock);
+}
+
+void remove_dirty_dir_inode(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct list_head *head = &sbi->dir_inode_list;
+	struct list_head *this;
+
+	if (!S_ISDIR(inode->i_mode))
+		return;
+
+	spin_lock(&sbi->dir_inode_lock);
+	if (atomic_read(&F2FS_I(inode)->dirty_dents))
+		goto out;
+
+	list_for_each(this, head) {
+		struct dir_inode_entry *entry;
+		entry = list_entry(this, struct dir_inode_entry, list);
+		if (entry->inode == inode) {
+			list_del(&entry->list);
+			kmem_cache_free(inode_entry_slab, entry);
+			sbi->n_dirty_dirs--;
+			break;
+		}
+	}
+out:
+	spin_unlock(&sbi->dir_inode_lock);
+}
+
+void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
+{
+	struct list_head *head = &sbi->dir_inode_list;
+	struct dir_inode_entry *entry;
+	struct inode *inode;
+retry:
+	spin_lock(&sbi->dir_inode_lock);
+	if (list_empty(head)) {
+		spin_unlock(&sbi->dir_inode_lock);
+		return;
+	}
+	entry = list_entry(head->next, struct dir_inode_entry, list);
+	inode = igrab(entry->inode);
+	spin_unlock(&sbi->dir_inode_lock);
+	if (inode) {
+		filemap_flush(inode->i_mapping);
+		iput(inode);
+	} else {
+		/*
+		 * We should submit the bio, since there exist several
+		 * dentry pages being written back in the freeing inode.
+		 */
+		f2fs_submit_bio(sbi, DATA, true);
+	}
+	goto retry;
+}
+
+/**
+ * Freeze all the FS-operations for checkpoint.
+ */
+void block_operations(struct f2fs_sb_info *sbi)
+{
+	int t;
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL,
+		.nr_to_write = LONG_MAX,
+		.for_reclaim = 0,
+	};
+
+	/* Stop renaming operation */
+	mutex_lock_op(sbi, RENAME);
+	mutex_lock_op(sbi, DENTRY_OPS);
+
+retry_dents:
+	/* write all the dirty dentry pages */
+	sync_dirty_dir_inodes(sbi);
+
+	mutex_lock_op(sbi, DATA_WRITE);
+	if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
+		mutex_unlock_op(sbi, DATA_WRITE);
+		goto retry_dents;
+	}
+
+	/* block all the operations */
+	for (t = DATA_NEW; t <= NODE_TRUNC; t++)
+		mutex_lock_op(sbi, t);
+
+	mutex_lock(&sbi->write_inode);
+
+	/*
+	 * POR: we should ensure that there are no dirty node pages
+	 * until finishing nat/sit flush.
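+	 * Otherwise a node page written after flush_nat_entries() would
+	 * leave its on-disk NAT entry pointing at a stale block address,
+	 * breaking power-off recovery.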
+	 */
+retry:
+	sync_node_pages(sbi, 0, &wbc);
+
+	mutex_lock_op(sbi, NODE_WRITE);
+
+	if (get_pages(sbi, F2FS_DIRTY_NODES)) {
+		mutex_unlock_op(sbi, NODE_WRITE);
+		goto retry;
+	}
+	mutex_unlock(&sbi->write_inode);
+}
+
+static void unblock_operations(struct f2fs_sb_info *sbi)
+{
+	int t;
+	for (t = NODE_WRITE; t >= RENAME; t--)
+		mutex_unlock_op(sbi, t);
+}
+
+static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
+{
+	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+	nid_t last_nid = 0;
+	block_t start_blk;
+	struct page *cp_page;
+	unsigned int data_sum_blocks, orphan_blocks;
+	void *kaddr;
+	__u32 crc32 = 0;
+	int i;
+
+	/* Flush all the NAT/SIT pages */
+	while (get_pages(sbi, F2FS_DIRTY_META))
+		sync_meta_pages(sbi, META, LONG_MAX);
+
+	next_free_nid(sbi, &last_nid);
+
+	/*
+	 * modify checkpoint
+	 * version number is already updated
+	 */
+	ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi));
+	ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
+	ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
+	for (i = 0; i < 3; i++) {
+		ckpt->cur_node_segno[i] =
+			cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE));
+		ckpt->cur_node_blkoff[i] =
+			cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_NODE));
+		ckpt->alloc_type[i + CURSEG_HOT_NODE] =
+				curseg_alloc_type(sbi, i + CURSEG_HOT_NODE);
+	}
+	for (i = 0; i < 3; i++) {
+		ckpt->cur_data_segno[i] =
+			cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA));
+		ckpt->cur_data_blkoff[i] =
+			cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_DATA));
+		ckpt->alloc_type[i + CURSEG_HOT_DATA] =
+				curseg_alloc_type(sbi, i + CURSEG_HOT_DATA);
+	}
+
+	ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi));
+	ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi));
+	ckpt->next_free_nid = cpu_to_le32(last_nid);
+
+	/* 2 cp + n data seg summary + orphan inode blocks */
+	data_sum_blocks = npages_for_summary_flush(sbi);
+	if (data_sum_blocks < 3)
+		ckpt->ckpt_flags |= CP_COMPACT_SUM_FLAG;
+	else
+		ckpt->ckpt_flags &= (~CP_COMPACT_SUM_FLAG);
+
+	orphan_blocks = (sbi->n_orphans + F2FS_ORPHANS_PER_BLOCK - 1)
+					/ F2FS_ORPHANS_PER_BLOCK;
+	ckpt->cp_pack_start_sum = 1 + orphan_blocks;
+	ckpt->cp_pack_total_block_count = 2 + data_sum_blocks + orphan_blocks;
+
+	if (is_umount) {
+		ckpt->ckpt_flags |= CP_UMOUNT_FLAG;
+		ckpt->cp_pack_total_block_count += NR_CURSEG_NODE_TYPE;
+	} else {
+		ckpt->ckpt_flags &= (~CP_UMOUNT_FLAG);
+	}
+
+	if (sbi->n_orphans)
+		ckpt->ckpt_flags |= CP_ORPHAN_PRESENT_FLAG;
+	else
+		ckpt->ckpt_flags &= (~CP_ORPHAN_PRESENT_FLAG);
+
+	/* update SIT/NAT bitmap */
+	get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
+	get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
+
+	crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset));
+	*(__u32 *)((unsigned char *)ckpt +
+				le32_to_cpu(ckpt->checksum_offset))
+				= cpu_to_le32(crc32);
+
+	start_blk = __start_cp_addr(sbi);
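+	/*
+	 * Resulting CP pack layout, in block order from start_blk
+	 * (cp_pack_start_sum = 1 + orphan_blocks above):
+	 *   cp block | orphan blocks | data summaries |
+	 *   node summaries (umount only) | cp block (copy)
+	 */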
+
+	/* write out checkpoint buffer at block 0 */
+	cp_page = grab_meta_page(sbi, start_blk++);
+	kaddr = page_address(cp_page);
+	memcpy(kaddr, ckpt, (1 << sbi->log_blocksize));
+	set_page_dirty(cp_page);
+	f2fs_put_page(cp_page, 1);
+
+	if (sbi->n_orphans) {
+		write_orphan_inodes(sbi, start_blk);
+		start_blk += orphan_blocks;
+	}
+
+	write_data_summaries(sbi, start_blk);
+	start_blk += data_sum_blocks;
+	if (is_umount) {
+		write_node_summaries(sbi, start_blk);
+		start_blk += NR_CURSEG_NODE_TYPE;
+	}
+
+	/* writeout checkpoint block */
+	cp_page = grab_meta_page(sbi, start_blk);
+	kaddr = page_address(cp_page);
+	memcpy(kaddr, ckpt, (1 << sbi->log_blocksize));
+	set_page_dirty(cp_page);
+	f2fs_put_page(cp_page, 1);
+
+	/* wait for previous submitted node/meta pages writeback */
+	while (get_pages(sbi, F2FS_WRITEBACK))
+		congestion_wait(BLK_RW_ASYNC, HZ / 50);
+
+	filemap_fdatawait_range(sbi->node_inode->i_mapping, 0, LONG_MAX);
+	filemap_fdatawait_range(sbi->meta_inode->i_mapping, 0, LONG_MAX);
+
+	/* update user_block_counts */
+	sbi->last_valid_block_count = sbi->total_valid_block_count;
+	sbi->alloc_valid_block_count = 0;
+
+	/* Here, we only have one bio having CP pack */
+	if (sbi->ckpt->ckpt_flags & CP_ERROR_FLAG)
+		sbi->sb->s_flags |= MS_RDONLY;
+	else
+		sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
+
+	clear_prefree_segments(sbi);
+	F2FS_RESET_SB_DIRT(sbi);
+}
+
+/**
+ * We guarantee that this checkpoint procedure will not fail.
+ */
+void write_checkpoint(struct f2fs_sb_info *sbi, bool blocked, bool is_umount)
+{
+	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+	unsigned long long ckpt_ver;
+
+	if (!blocked) {
+		mutex_lock(&sbi->cp_mutex);
+		block_operations(sbi);
+	}
+
+	f2fs_submit_bio(sbi, DATA, true);
+	f2fs_submit_bio(sbi, NODE, true);
+	f2fs_submit_bio(sbi, META, true);
+
+	/*
+	 * update checkpoint pack index
+	 * Increase the version number so that
+	 * SIT entries and seg summaries are written at correct place
+	 */
+	ckpt_ver = le64_to_cpu(ckpt->checkpoint_ver);
+	ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver);
+
+	/* write cached NAT/SIT entries to NAT/SIT area */
+	flush_nat_entries(sbi);
+	flush_sit_entries(sbi);
+
+	reset_victim_segmap(sbi);
+
+	/* unlock all the fs_lock[] in do_checkpoint() */
+	do_checkpoint(sbi, is_umount);
+
+	unblock_operations(sbi);
+	mutex_unlock(&sbi->cp_mutex);
+}
+
+void init_orphan_info(struct f2fs_sb_info *sbi)
+{
+	mutex_init(&sbi->orphan_inode_mutex);
+	INIT_LIST_HEAD(&sbi->orphan_inode_list);
+	sbi->n_orphans = 0;
+}
+
+int create_checkpoint_caches(void)
+{
+	orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry",
+			sizeof(struct orphan_inode_entry), NULL);
+	if (unlikely(!orphan_entry_slab))
+		return -ENOMEM;
+	inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry",
+			sizeof(struct dir_inode_entry), NULL);
+	if (unlikely(!inode_entry_slab)) {
+		kmem_cache_destroy(orphan_entry_slab);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void destroy_checkpoint_caches(void)
+{
+	kmem_cache_destroy(orphan_entry_slab);
+	kmem_cache_destroy(inode_entry_slab);
+}
--
cgit v1.2.1


From e05df3b115e7308afbca652769b54e4549fcc723 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim
Date: Fri, 2 Nov 2012 17:08:50 +0900
Subject: f2fs: add node operations

This adds specific functions to manage NAT pages, a cache for NAT
entries, free nids, direct/indirect node blocks for indexing data, and
address space for node pages.

- The key information of an NAT entry consists of a node id and a block
  address.

- An NAT page is composed of block addresses covered by a certain range
  of NAT entries, which is maintained by the address space of meta_inode.

- A radix tree structure is used to cache NAT entries. The index for
  the tree is a node id.

- When there is no free nid, F2FS should scan NAT entries to find a new
  one. In order to avoid scanning frequently, F2FS manages a list
  containing a number of free nids in memory. Only when the free nids in
  the list are exhausted is the scanning process, build_free_nids(),
  triggered.

- F2FS has direct and indirect node blocks for indexing data. This
  patch adds functions related to the node block management such as
  getting, allocating, and truncating node blocks to index data.
- In order to cache node blocks in memory, F2FS has a node_inode with
  an address space for node pages. This patch also adds the address
  space operations for node_inode.

Signed-off-by: Jaegeuk Kim
---
 fs/f2fs/node.c | 1763 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 1763 insertions(+)
 create mode 100644 fs/f2fs/node.c
(limited to 'fs')

diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
new file mode 100644
index 000000000000..216f04dc1177
--- /dev/null
+++ b/fs/f2fs/node.c
@@ -0,0 +1,1763 @@
+/**
+ * fs/f2fs/node.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+
+static struct kmem_cache *nat_entry_slab;
+static struct kmem_cache *free_nid_slab;
+
+static void clear_node_page_dirty(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
+	unsigned long flags;
+
+	if (PageDirty(page)) {
+		spin_lock_irqsave(&mapping->tree_lock, flags);
+		radix_tree_tag_clear(&mapping->page_tree,
+				page_index(page),
+				PAGECACHE_TAG_DIRTY);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+
+		clear_page_dirty_for_io(page);
+		dec_page_count(sbi, F2FS_DIRTY_NODES);
+	}
+	ClearPageUptodate(page);
+}
+
+static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
+{
+	pgoff_t index = current_nat_addr(sbi, nid);
+	return get_meta_page(sbi, index);
+}
+
+static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
+{
+	struct page *src_page;
+	struct page *dst_page;
+	pgoff_t src_off;
+	pgoff_t dst_off;
+	void *src_addr;
+	void *dst_addr;
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+
+	src_off = current_nat_addr(sbi, nid);
+	dst_off = next_nat_addr(sbi, src_off);
+
+	/* get current nat block page with lock */
+	src_page = get_meta_page(sbi, src_off);
+
+	/* Dirty src_page means that it is already the new target NAT page. */
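+	/*
+	 * NAT blocks ping-pong between two on-disk locations; the bit
+	 * flipped by set_to_next_nat() below records which copy the
+	 * next checkpoint should read.
+	 */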
+	if (PageDirty(src_page))
+		return src_page;
+
+	dst_page = grab_meta_page(sbi, dst_off);
+
+	src_addr = page_address(src_page);
+	dst_addr = page_address(dst_page);
+	memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE);
+	set_page_dirty(dst_page);
+	f2fs_put_page(src_page, 1);
+
+	set_to_next_nat(nm_i, nid);
+
+	return dst_page;
+}
+
+/**
+ * Readahead NAT pages
+ */
+static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid)
+{
+	struct address_space *mapping = sbi->meta_inode->i_mapping;
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct page *page;
+	pgoff_t index;
+	int i;
+
+	for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) {
+		if (nid >= nm_i->max_nid)
+			nid = 0;
+		index = current_nat_addr(sbi, nid);
+
+		page = grab_cache_page(mapping, index);
+		if (!page)
+			continue;
+		if (f2fs_readpage(sbi, page, index, READ)) {
+			f2fs_put_page(page, 1);
+			continue;
+		}
+		page_cache_release(page);
+	}
+}
+
+static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
+{
+	return radix_tree_lookup(&nm_i->nat_root, n);
+}
+
+static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i,
+		nid_t start, unsigned int nr, struct nat_entry **ep)
+{
+	return radix_tree_gang_lookup(&nm_i->nat_root, (void **)ep, start, nr);
+}
+
+static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
+{
+	list_del(&e->list);
+	radix_tree_delete(&nm_i->nat_root, nat_get_nid(e));
+	nm_i->nat_cnt--;
+	kmem_cache_free(nat_entry_slab, e);
+}
+
+int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct nat_entry *e;
+	int is_cp = 1;
+
+	read_lock(&nm_i->nat_tree_lock);
+	e = __lookup_nat_cache(nm_i, nid);
+	if (e && !e->checkpointed)
+		is_cp = 0;
+	read_unlock(&nm_i->nat_tree_lock);
+	return is_cp;
+}
+
+static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
+{
+	struct nat_entry *new;
+
+	new = kmem_cache_alloc(nat_entry_slab, GFP_ATOMIC);
+	if (!new)
+		return NULL;
+	if (radix_tree_insert(&nm_i->nat_root, nid, new)) {
+		kmem_cache_free(nat_entry_slab, new);
+		return NULL;
+	}
+	memset(new, 0, sizeof(struct nat_entry));
+	nat_set_nid(new, nid);
+	list_add_tail(&new->list, &nm_i->nat_entries);
+	nm_i->nat_cnt++;
+	return new;
+}
+
+static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid,
+						struct f2fs_nat_entry *ne)
+{
+	struct nat_entry *e;
+retry:
+	write_lock(&nm_i->nat_tree_lock);
+	e = __lookup_nat_cache(nm_i, nid);
+	if (!e) {
+		e = grab_nat_entry(nm_i, nid);
+		if (!e) {
+			write_unlock(&nm_i->nat_tree_lock);
+			goto retry;
+		}
+		nat_set_blkaddr(e, le32_to_cpu(ne->block_addr));
+		nat_set_ino(e, le32_to_cpu(ne->ino));
+		nat_set_version(e, ne->version);
+		e->checkpointed = true;
+	}
+	write_unlock(&nm_i->nat_tree_lock);
+}
+
+static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
+			block_t new_blkaddr)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct nat_entry *e;
+retry:
+	write_lock(&nm_i->nat_tree_lock);
+	e = __lookup_nat_cache(nm_i, ni->nid);
+	if (!e) {
+		e = grab_nat_entry(nm_i, ni->nid);
+		if (!e) {
+			write_unlock(&nm_i->nat_tree_lock);
+			goto retry;
+		}
+		e->ni = *ni;
+		e->checkpointed = true;
+		BUG_ON(ni->blk_addr == NEW_ADDR);
+	} else if (new_blkaddr == NEW_ADDR) {
+		/*
+		 * when a nid is reallocated, the previous nat entry can
+		 * remain in the nat cache. So reinitialize it with new
+		 * information.
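+		 * e.g. nid 100 is freed (its cached blk_addr becomes
+		 * NULL_ADDR) and later handed out again by alloc_nid();
+		 * the stale entry must be refreshed before the sanity
+		 * checks below run.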
+		 */
+		e->ni = *ni;
+		BUG_ON(ni->blk_addr != NULL_ADDR);
+	}
+
+	if (new_blkaddr == NEW_ADDR)
+		e->checkpointed = false;
+
+	/* sanity check */
+	BUG_ON(nat_get_blkaddr(e) != ni->blk_addr);
+	BUG_ON(nat_get_blkaddr(e) == NULL_ADDR &&
+			new_blkaddr == NULL_ADDR);
+	BUG_ON(nat_get_blkaddr(e) == NEW_ADDR &&
+			new_blkaddr == NEW_ADDR);
+	BUG_ON(nat_get_blkaddr(e) != NEW_ADDR &&
+			nat_get_blkaddr(e) != NULL_ADDR &&
+			new_blkaddr == NEW_ADDR);
+
+	/* increment version no as node is removed */
+	if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
+		unsigned char version = nat_get_version(e);
+		nat_set_version(e, inc_node_version(version));
+	}
+
+	/* change address */
+	nat_set_blkaddr(e, new_blkaddr);
+	__set_nat_cache_dirty(nm_i, e);
+	write_unlock(&nm_i->nat_tree_lock);
+}
+
+static int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+
+	if (nm_i->nat_cnt < 2 * NM_WOUT_THRESHOLD)
+		return 0;
+
+	write_lock(&nm_i->nat_tree_lock);
+	while (nr_shrink && !list_empty(&nm_i->nat_entries)) {
+		struct nat_entry *ne;
+		ne = list_first_entry(&nm_i->nat_entries,
+					struct nat_entry, list);
+		__del_from_nat_cache(nm_i, ne);
+		nr_shrink--;
+	}
+	write_unlock(&nm_i->nat_tree_lock);
+	return nr_shrink;
+}
+
+/**
+ * This function always returns success
+ */
+void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+	struct f2fs_summary_block *sum = curseg->sum_blk;
+	nid_t start_nid = START_NID(nid);
+	struct f2fs_nat_block *nat_blk;
+	struct page *page = NULL;
+	struct f2fs_nat_entry ne;
+	struct nat_entry *e;
+	int i;
+
+	ni->nid = nid;
+
+	/* Check nat cache */
+	read_lock(&nm_i->nat_tree_lock);
+	e = __lookup_nat_cache(nm_i, nid);
+	if (e) {
+		ni->ino = nat_get_ino(e);
+		ni->blk_addr = nat_get_blkaddr(e);
+		ni->version = nat_get_version(e);
+	}
+	read_unlock(&nm_i->nat_tree_lock);
+	if (e)
+		return;
+
+	/* Check current segment summary */
+	mutex_lock(&curseg->curseg_mutex);
+	i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0);
+	if (i >= 0) {
+		ne = nat_in_journal(sum, i);
+		node_info_from_raw_nat(ni, &ne);
+	}
+	mutex_unlock(&curseg->curseg_mutex);
+	if (i >= 0)
+		goto cache;
+
+	/* Fill node_info from nat page */
+	page = get_current_nat_page(sbi, start_nid);
+	nat_blk = (struct f2fs_nat_block *)page_address(page);
+	ne = nat_blk->entries[nid - start_nid];
+	node_info_from_raw_nat(ni, &ne);
+	f2fs_put_page(page, 1);
+cache:
+	/* cache nat entry */
+	cache_nat_entry(NM_I(sbi), nid, &ne);
+}
+
+/**
+ * The maximum depth is four.
+ * Offset[0] will have raw inode offset.
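+ *
+ * A worked example, assuming the 4KB defaults from this patch set
+ * (ADDRS_PER_INODE 923, ADDRS_PER_BLOCK and NIDS_PER_BLOCK 1018):
+ * block 0 is addressed by the inode itself (level 0), while block 923
+ * falls in the first direct node, giving level 1 with
+ * offset[0] = NODE_DIR1_BLOCK and offset[1] = 0.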
+ */ +static int get_node_path(long block, int offset[4], unsigned int noffset[4]) +{ + const long direct_index = ADDRS_PER_INODE; + const long direct_blks = ADDRS_PER_BLOCK; + const long dptrs_per_blk = NIDS_PER_BLOCK; + const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK; + const long dindirect_blks = indirect_blks * NIDS_PER_BLOCK; + int n = 0; + int level = 0; + + noffset[0] = 0; + + if (block < direct_index) { + offset[n++] = block; + level = 0; + goto got; + } + block -= direct_index; + if (block < direct_blks) { + offset[n++] = NODE_DIR1_BLOCK; + noffset[n] = 1; + offset[n++] = block; + level = 1; + goto got; + } + block -= direct_blks; + if (block < direct_blks) { + offset[n++] = NODE_DIR2_BLOCK; + noffset[n] = 2; + offset[n++] = block; + level = 1; + goto got; + } + block -= direct_blks; + if (block < indirect_blks) { + offset[n++] = NODE_IND1_BLOCK; + noffset[n] = 3; + offset[n++] = block / direct_blks; + noffset[n] = 4 + offset[n - 1]; + offset[n++] = block % direct_blks; + level = 2; + goto got; + } + block -= indirect_blks; + if (block < indirect_blks) { + offset[n++] = NODE_IND2_BLOCK; + noffset[n] = 4 + dptrs_per_blk; + offset[n++] = block / direct_blks; + noffset[n] = 5 + dptrs_per_blk + offset[n - 1]; + offset[n++] = block % direct_blks; + level = 2; + goto got; + } + block -= indirect_blks; + if (block < dindirect_blks) { + offset[n++] = NODE_DIND_BLOCK; + noffset[n] = 5 + (dptrs_per_blk * 2); + offset[n++] = block / indirect_blks; + noffset[n] = 6 + (dptrs_per_blk * 2) + + offset[n - 1] * (dptrs_per_blk + 1); + offset[n++] = (block / direct_blks) % dptrs_per_blk; + noffset[n] = 7 + (dptrs_per_blk * 2) + + offset[n - 2] * (dptrs_per_blk + 1) + + offset[n - 1]; + offset[n++] = block % direct_blks; + level = 3; + goto got; + } else { + BUG(); + } +got: + return level; +} + +/* + * Caller should call f2fs_put_dnode(dn). 
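+ *
+ * A typical read-only lookup, as a rough sketch (assuming RDONLY_NODE is
+ * the read-only flag defined in f2fs.h):
+ *
+ *	struct dnode_of_data dn;
+ *	set_new_dnode(&dn, inode, NULL, NULL, 0);
+ *	err = get_dnode_of_data(&dn, index, RDONLY_NODE);
+ *	if (!err) {
+ *		... use dn.data_blkaddr and dn.ofs_in_node ...
+ *		f2fs_put_dnode(&dn);
+ *	}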
+ */ +int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); + struct page *npage[4]; + struct page *parent; + int offset[4]; + unsigned int noffset[4]; + nid_t nids[4]; + int level, i; + int err = 0; + + level = get_node_path(index, offset, noffset); + + nids[0] = dn->inode->i_ino; + npage[0] = get_node_page(sbi, nids[0]); + if (IS_ERR(npage[0])) + return PTR_ERR(npage[0]); + + parent = npage[0]; + nids[1] = get_nid(parent, offset[0], true); + dn->inode_page = npage[0]; + dn->inode_page_locked = true; + + /* get indirect or direct nodes */ + for (i = 1; i <= level; i++) { + bool done = false; + + if (!nids[i] && !ro) { + mutex_lock_op(sbi, NODE_NEW); + + /* alloc new node */ + if (!alloc_nid(sbi, &(nids[i]))) { + mutex_unlock_op(sbi, NODE_NEW); + err = -ENOSPC; + goto release_pages; + } + + dn->nid = nids[i]; + npage[i] = new_node_page(dn, noffset[i]); + if (IS_ERR(npage[i])) { + alloc_nid_failed(sbi, nids[i]); + mutex_unlock_op(sbi, NODE_NEW); + err = PTR_ERR(npage[i]); + goto release_pages; + } + + set_nid(parent, offset[i - 1], nids[i], i == 1); + alloc_nid_done(sbi, nids[i]); + mutex_unlock_op(sbi, NODE_NEW); + done = true; + } else if (ro && i == level && level > 1) { + npage[i] = get_node_page_ra(parent, offset[i - 1]); + if (IS_ERR(npage[i])) { + err = PTR_ERR(npage[i]); + goto release_pages; + } + done = true; + } + if (i == 1) { + dn->inode_page_locked = false; + unlock_page(parent); + } else { + f2fs_put_page(parent, 1); + } + + if (!done) { + npage[i] = get_node_page(sbi, nids[i]); + if (IS_ERR(npage[i])) { + err = PTR_ERR(npage[i]); + f2fs_put_page(npage[0], 0); + goto release_out; + } + } + if (i < level) { + parent = npage[i]; + nids[i + 1] = get_nid(parent, offset[i], false); + } + } + dn->nid = nids[level]; + dn->ofs_in_node = offset[level]; + dn->node_page = npage[level]; + dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); + return 0; + +release_pages: + f2fs_put_page(parent, 1); + if (i > 1) + f2fs_put_page(npage[0], 0); +release_out: + dn->inode_page = NULL; + dn->node_page = NULL; + return err; +} + +static void truncate_node(struct dnode_of_data *dn) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); + struct node_info ni; + + get_node_info(sbi, dn->nid, &ni); + BUG_ON(ni.blk_addr == NULL_ADDR); + + if (ni.blk_addr != NULL_ADDR) + invalidate_blocks(sbi, ni.blk_addr); + + /* Deallocate node address */ + dec_valid_node_count(sbi, dn->inode, 1); + set_node_addr(sbi, &ni, NULL_ADDR); + + if (dn->nid == dn->inode->i_ino) { + remove_orphan_inode(sbi, dn->nid); + dec_valid_inode_count(sbi); + } else { + sync_inode_page(dn); + } + + clear_node_page_dirty(dn->node_page); + F2FS_SET_SB_DIRT(sbi); + + f2fs_put_page(dn->node_page, 1); + dn->node_page = NULL; +} + +static int truncate_dnode(struct dnode_of_data *dn) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); + struct page *page; + + if (dn->nid == 0) + return 1; + + /* get direct node */ + page = get_node_page(sbi, dn->nid); + if (IS_ERR(page) && PTR_ERR(page) == -ENOENT) + return 1; + else if (IS_ERR(page)) + return PTR_ERR(page); + + /* Make dnode_of_data for parameter */ + dn->node_page = page; + dn->ofs_in_node = 0; + truncate_data_blocks(dn); + truncate_node(dn); + return 1; +} + +static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, + int ofs, int depth) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); + struct dnode_of_data rdn = *dn; + struct page *page; + struct f2fs_node *rn; + nid_t 
child_nid;
+	unsigned int child_nofs;
+	int freed = 0;
+	int i, ret;
+
+	if (dn->nid == 0)
+		return NIDS_PER_BLOCK + 1;
+
+	page = get_node_page(sbi, dn->nid);
+	if (IS_ERR(page))
+		return PTR_ERR(page);
+
+	rn = (struct f2fs_node *)page_address(page);
+	if (depth < 3) {
+		for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) {
+			child_nid = le32_to_cpu(rn->in.nid[i]);
+			if (child_nid == 0)
+				continue;
+			rdn.nid = child_nid;
+			ret = truncate_dnode(&rdn);
+			if (ret < 0)
+				goto out_err;
+			set_nid(page, i, 0, false);
+		}
+	} else {
+		child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1;
+		for (i = ofs; i < NIDS_PER_BLOCK; i++) {
+			child_nid = le32_to_cpu(rn->in.nid[i]);
+			if (child_nid == 0) {
+				child_nofs += NIDS_PER_BLOCK + 1;
+				continue;
+			}
+			rdn.nid = child_nid;
+			ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1);
+			if (ret == (NIDS_PER_BLOCK + 1)) {
+				set_nid(page, i, 0, false);
+				child_nofs += ret;
+			} else if (ret < 0 && ret != -ENOENT) {
+				goto out_err;
+			}
+		}
+		freed = child_nofs;
+	}
+
+	if (!ofs) {
+		/* remove current indirect node */
+		dn->node_page = page;
+		truncate_node(dn);
+		freed++;
+	} else {
+		f2fs_put_page(page, 1);
+	}
+	return freed;
+
+out_err:
+	f2fs_put_page(page, 1);
+	return ret;
+}
+
+static int truncate_partial_nodes(struct dnode_of_data *dn,
+			struct f2fs_inode *ri, int *offset, int depth)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+	struct page *pages[2];
+	nid_t nid[3];
+	nid_t child_nid;
+	int err = 0;
+	int i;
+	int idx = depth - 2;
+
+	nid[0] = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
+	if (!nid[0])
+		return 0;
+
+	/* get indirect nodes in the path */
+	for (i = 0; i < depth - 1; i++) {
+		/* the reference count will be increased */
+		pages[i] = get_node_page(sbi, nid[i]);
+		if (IS_ERR(pages[i])) {
+			depth = i + 1;
+			err = PTR_ERR(pages[i]);
+			goto fail;
+		}
+		nid[i + 1] = get_nid(pages[i], offset[i + 1], false);
+	}
+
+	/* free direct nodes linked to a partial indirect node */
+	for (i = offset[depth - 1]; i < NIDS_PER_BLOCK; i++) {
+		child_nid = get_nid(pages[idx], i, false);
+		if (!child_nid)
+			continue;
+		dn->nid = child_nid;
+		err = truncate_dnode(dn);
+		if (err < 0)
+			goto fail;
+		set_nid(pages[idx], i, 0, false);
+	}
+
+	if (offset[depth - 1] == 0) {
+		dn->node_page = pages[idx];
+		dn->nid = nid[idx];
+		truncate_node(dn);
+	} else {
+		f2fs_put_page(pages[idx], 1);
+	}
+	offset[idx]++;
+	offset[depth - 1] = 0;
+fail:
+	for (i = depth - 3; i >= 0; i--)
+		f2fs_put_page(pages[i], 1);
+	return err;
+}
+
+/**
+ * All the block addresses of data and nodes should be nullified.
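+ *
+ * truncate_inode_blocks() below walks the node tree downward from 'from':
+ * it first trims any partially-covered indirect node via
+ * truncate_partial_nodes(), then drops whole subtrees slot by slot with
+ * truncate_dnode()/truncate_nodes(), zeroing each parent i_nid[] slot as
+ * the children are freed.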
+ */ +int truncate_inode_blocks(struct inode *inode, pgoff_t from) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + int err = 0, cont = 1; + int level, offset[4], noffset[4]; + unsigned int nofs; + struct f2fs_node *rn; + struct dnode_of_data dn; + struct page *page; + + level = get_node_path(from, offset, noffset); + + page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(page)) + return PTR_ERR(page); + + set_new_dnode(&dn, inode, page, NULL, 0); + unlock_page(page); + + rn = page_address(page); + switch (level) { + case 0: + case 1: + nofs = noffset[1]; + break; + case 2: + nofs = noffset[1]; + if (!offset[level - 1]) + goto skip_partial; + err = truncate_partial_nodes(&dn, &rn->i, offset, level); + if (err < 0 && err != -ENOENT) + goto fail; + nofs += 1 + NIDS_PER_BLOCK; + break; + case 3: + nofs = 5 + 2 * NIDS_PER_BLOCK; + if (!offset[level - 1]) + goto skip_partial; + err = truncate_partial_nodes(&dn, &rn->i, offset, level); + if (err < 0 && err != -ENOENT) + goto fail; + break; + default: + BUG(); + } + +skip_partial: + while (cont) { + dn.nid = le32_to_cpu(rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]); + switch (offset[0]) { + case NODE_DIR1_BLOCK: + case NODE_DIR2_BLOCK: + err = truncate_dnode(&dn); + break; + + case NODE_IND1_BLOCK: + case NODE_IND2_BLOCK: + err = truncate_nodes(&dn, nofs, offset[1], 2); + break; + + case NODE_DIND_BLOCK: + err = truncate_nodes(&dn, nofs, offset[1], 3); + cont = 0; + break; + + default: + BUG(); + } + if (err < 0 && err != -ENOENT) + goto fail; + if (offset[1] == 0 && + rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]) { + lock_page(page); + wait_on_page_writeback(page); + rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; + set_page_dirty(page); + unlock_page(page); + } + offset[1] = 0; + offset[0]++; + nofs += err; + } +fail: + f2fs_put_page(page, 0); + return err > 0 ? 
0 : err;
+}
+
+int remove_inode_page(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct page *page;
+	nid_t ino = inode->i_ino;
+	struct dnode_of_data dn;
+
+	mutex_lock_op(sbi, NODE_TRUNC);
+	page = get_node_page(sbi, ino);
+	if (IS_ERR(page)) {
+		mutex_unlock_op(sbi, NODE_TRUNC);
+		return PTR_ERR(page);
+	}
+
+	if (F2FS_I(inode)->i_xattr_nid) {
+		nid_t nid = F2FS_I(inode)->i_xattr_nid;
+		struct page *npage = get_node_page(sbi, nid);
+
+		if (IS_ERR(npage)) {
+			mutex_unlock_op(sbi, NODE_TRUNC);
+			return PTR_ERR(npage);
+		}
+
+		F2FS_I(inode)->i_xattr_nid = 0;
+		set_new_dnode(&dn, inode, page, npage, nid);
+		dn.inode_page_locked = 1;
+		truncate_node(&dn);
+	}
+	if (inode->i_blocks == 1) {
+		/* internally calls f2fs_put_page() */
+		set_new_dnode(&dn, inode, page, page, ino);
+		truncate_node(&dn);
+	} else if (inode->i_blocks == 0) {
+		struct node_info ni;
+		get_node_info(sbi, inode->i_ino, &ni);
+
+		/* called when f2fs_new_inode() has failed */
+		BUG_ON(ni.blk_addr != NULL_ADDR);
+		f2fs_put_page(page, 1);
+	} else {
+		BUG();
+	}
+	mutex_unlock_op(sbi, NODE_TRUNC);
+	return 0;
+}
+
+int new_inode_page(struct inode *inode, struct dentry *dentry)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct page *page;
+	struct dnode_of_data dn;
+
+	/* allocate inode page for new inode */
+	set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
+	mutex_lock_op(sbi, NODE_NEW);
+	page = new_node_page(&dn, 0);
+	if (IS_ERR(page)) {
+		mutex_unlock_op(sbi, NODE_NEW);
+		return PTR_ERR(page);
+	}
+	init_dent_inode(dentry, page);
+	mutex_unlock_op(sbi, NODE_NEW);
+	f2fs_put_page(page, 1);
+	return 0;
+}
+
+struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+	struct address_space *mapping = sbi->node_inode->i_mapping;
+	struct node_info old_ni, new_ni;
+	struct page *page;
+	int err;
+
+	if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))
+		return ERR_PTR(-EPERM);
+
+	page = grab_cache_page(mapping, dn->nid);
+	if (!page)
+		return ERR_PTR(-ENOMEM);
+
+	get_node_info(sbi, dn->nid, &old_ni);
+
+	SetPageUptodate(page);
+	fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
+
+	/* Reinitialize old_ni with new node page */
+	BUG_ON(old_ni.blk_addr != NULL_ADDR);
+	new_ni = old_ni;
+	new_ni.ino = dn->inode->i_ino;
+
+	if (!inc_valid_node_count(sbi, dn->inode, 1)) {
+		err = -ENOSPC;
+		goto fail;
+	}
+	set_node_addr(sbi, &new_ni, NEW_ADDR);
+
+	dn->node_page = page;
+	sync_inode_page(dn);
+	set_page_dirty(page);
+	set_cold_node(dn->inode, page);
+	if (ofs == 0)
+		inc_valid_inode_count(sbi);
+
+	return page;
+
+fail:
+	f2fs_put_page(page, 1);
+	return ERR_PTR(err);
+}
+
+static int read_node_page(struct page *page, int type)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
+	struct node_info ni;
+
+	get_node_info(sbi, page->index, &ni);
+
+	if (ni.blk_addr == NULL_ADDR)
+		return -ENOENT;
+	return f2fs_readpage(sbi, page, ni.blk_addr, type);
+}
+
+/**
+ * Readahead a node page
+ */
+void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
+{
+	struct address_space *mapping = sbi->node_inode->i_mapping;
+	struct page *apage;
+
+	apage = find_get_page(mapping, nid);
+	if (apage && PageUptodate(apage))
+		goto release_out;
+	f2fs_put_page(apage, 0);
+
+	apage = grab_cache_page(mapping, nid);
+	if (!apage)
+		return;
+
+	if (read_node_page(apage, READA))
+		goto unlock_out;
+
+	page_cache_release(apage);
+	return;
+
+unlock_out:
+	unlock_page(apage);
+release_out:
+	page_cache_release(apage);
+}
+
+struct page
*get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
+{
+	int err;
+	struct page *page;
+	struct address_space *mapping = sbi->node_inode->i_mapping;
+
+	page = grab_cache_page(mapping, nid);
+	if (!page)
+		return ERR_PTR(-ENOMEM);
+
+	err = read_node_page(page, READ_SYNC);
+	if (err) {
+		f2fs_put_page(page, 1);
+		return ERR_PTR(err);
+	}
+
+	BUG_ON(nid != nid_of_node(page));
+	mark_page_accessed(page);
+	return page;
+}
+
+/**
+ * Return a locked page for the desired node page,
+ * and readahead MAX_RA_NODE node pages.
+ */
+struct page *get_node_page_ra(struct page *parent, int start)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb);
+	struct address_space *mapping = sbi->node_inode->i_mapping;
+	int i, end;
+	int err = 0;
+	nid_t nid;
+	struct page *page;
+
+	/* First, try getting the desired direct node. */
+	nid = get_nid(parent, start, false);
+	if (!nid)
+		return ERR_PTR(-ENOENT);
+
+	page = find_get_page(mapping, nid);
+	if (page && PageUptodate(page))
+		goto page_hit;
+	f2fs_put_page(page, 0);
+
+repeat:
+	page = grab_cache_page(mapping, nid);
+	if (!page)
+		return ERR_PTR(-ENOMEM);
+
+	err = read_node_page(page, READA);
+	if (err) {
+		f2fs_put_page(page, 1);
+		return ERR_PTR(err);
+	}
+
+	/* Then, try readahead for siblings of the desired node */
+	end = start + MAX_RA_NODE;
+	end = min(end, NIDS_PER_BLOCK);
+	for (i = start + 1; i < end; i++) {
+		nid = get_nid(parent, i, false);
+		if (!nid)
+			continue;
+		ra_node_page(sbi, nid);
+	}
+
+page_hit:
+	lock_page(page);
+	if (PageError(page)) {
+		f2fs_put_page(page, 1);
+		return ERR_PTR(-EIO);
+	}
+
+	/* Has the page been truncated? */
+	if (page->mapping != mapping) {
+		f2fs_put_page(page, 1);
+		goto repeat;
+	}
+	return page;
+}
+
+void sync_inode_page(struct dnode_of_data *dn)
+{
+	if (IS_INODE(dn->node_page) || dn->inode_page == dn->node_page) {
+		update_inode(dn->inode, dn->node_page);
+	} else if (dn->inode_page) {
+		if (!dn->inode_page_locked)
+			lock_page(dn->inode_page);
+		update_inode(dn->inode, dn->inode_page);
+		if (!dn->inode_page_locked)
+			unlock_page(dn->inode_page);
+	} else {
+		f2fs_write_inode(dn->inode, NULL);
+	}
+}
+
+int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino,
+			struct writeback_control *wbc)
+{
+	struct address_space *mapping = sbi->node_inode->i_mapping;
+	pgoff_t index, end;
+	struct pagevec pvec;
+	int step = ino ? 2 : 0;
+	int nwritten = 0, wrote = 0;
+
+	pagevec_init(&pvec, 0);
+
+next_step:
+	index = 0;
+	end = LONG_MAX;
+
+	while (index <= end) {
+		int i, nr_pages;
+		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+				PAGECACHE_TAG_DIRTY,
+				min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+		if (nr_pages == 0)
+			break;
+
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			/*
+			 * flushing sequence with step:
+			 * 0. indirect nodes
+			 * 1. dentry dnodes
+			 * 2. file dnodes
+			 */
+			if (step == 0 && IS_DNODE(page))
+				continue;
+			if (step == 1 && (!IS_DNODE(page) ||
+						is_cold_node(page)))
+				continue;
+			if (step == 2 && (!IS_DNODE(page) ||
+						!is_cold_node(page)))
+				continue;
+
+			/*
+			 * In fsync mode,
+			 * we should not skip writing node pages.
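+			 *
+			 * For example, fsync() passes the target ino here;
+			 * every dirty dnode of that inode is then written
+			 * with a fsync mark (set below) so that roll-forward
+			 * recovery can locate it after a sudden power-off.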
+			 */
+			if (ino && ino_of_node(page) == ino)
+				lock_page(page);
+			else if (!trylock_page(page))
+				continue;
+
+			if (unlikely(page->mapping != mapping)) {
+continue_unlock:
+				unlock_page(page);
+				continue;
+			}
+			if (ino && ino_of_node(page) != ino)
+				goto continue_unlock;
+
+			if (!PageDirty(page)) {
+				/* someone wrote it for us */
+				goto continue_unlock;
+			}
+
+			if (!clear_page_dirty_for_io(page))
+				goto continue_unlock;
+
+			/* called by fsync() */
+			if (ino && IS_DNODE(page)) {
+				int mark = !is_checkpointed_node(sbi, ino);
+				set_fsync_mark(page, 1);
+				if (IS_INODE(page))
+					set_dentry_mark(page, mark);
+				nwritten++;
+			} else {
+				set_fsync_mark(page, 0);
+				set_dentry_mark(page, 0);
+			}
+			mapping->a_ops->writepage(page, wbc);
+			wrote++;
+
+			if (--wbc->nr_to_write == 0)
+				break;
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+
+		if (wbc->nr_to_write == 0) {
+			step = 2;
+			break;
+		}
+	}
+
+	if (step < 2) {
+		step++;
+		goto next_step;
+	}
+
+	if (wrote)
+		f2fs_submit_bio(sbi, NODE, wbc->sync_mode == WB_SYNC_ALL);
+
+	return nwritten;
+}
+
+static int f2fs_write_node_page(struct page *page,
+				struct writeback_control *wbc)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
+	nid_t nid;
+	unsigned int nofs;
+	block_t new_addr;
+	struct node_info ni;
+
+	if (wbc->for_reclaim) {
+		dec_page_count(sbi, F2FS_DIRTY_NODES);
+		wbc->pages_skipped++;
+		set_page_dirty(page);
+		return AOP_WRITEPAGE_ACTIVATE;
+	}
+
+	wait_on_page_writeback(page);
+
+	mutex_lock_op(sbi, NODE_WRITE);
+
+	/* get old block addr of this node page */
+	nid = nid_of_node(page);
+	nofs = ofs_of_node(page);
+	BUG_ON(page->index != nid);
+
+	get_node_info(sbi, nid, &ni);
+
+	/* This page is already truncated */
+	if (ni.blk_addr == NULL_ADDR) {
+		dec_page_count(sbi, F2FS_DIRTY_NODES);
+		mutex_unlock_op(sbi, NODE_WRITE);
+		unlock_page(page);
+		return 0;
+	}
+
+	set_page_writeback(page);
+
+	/* insert node offset */
+	write_node_page(sbi, page, nid, ni.blk_addr, &new_addr);
+	set_node_addr(sbi, &ni, new_addr);
+	dec_page_count(sbi, F2FS_DIRTY_NODES);
+
+	mutex_unlock_op(sbi, NODE_WRITE);
+	unlock_page(page);
+	return 0;
+}
+
+static int f2fs_write_node_pages(struct address_space *mapping,
+				struct writeback_control *wbc)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
+	struct block_device *bdev = sbi->sb->s_bdev;
+	long nr_to_write = wbc->nr_to_write;
+
+	if (wbc->for_kupdate)
+		return 0;
+
+	if (get_pages(sbi, F2FS_DIRTY_NODES) == 0)
+		return 0;
+
+	if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK)) {
+		write_checkpoint(sbi, false, false);
+		return 0;
+	}
+
+	/* if the mount has failed, skip writing node pages */
+	wbc->nr_to_write = bio_get_nr_vecs(bdev);
+	sync_node_pages(sbi, 0, wbc);
+	wbc->nr_to_write = nr_to_write -
+		(bio_get_nr_vecs(bdev) - wbc->nr_to_write);
+	return 0;
+}
+
+static int f2fs_set_node_page_dirty(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
+
+	SetPageUptodate(page);
+	if (!PageDirty(page)) {
+		__set_page_dirty_nobuffers(page);
+		inc_page_count(sbi, F2FS_DIRTY_NODES);
+		SetPagePrivate(page);
+		return 1;
+	}
+	return 0;
+}
+
+static void f2fs_invalidate_node_page(struct page *page, unsigned long offset)
+{
+	struct inode *inode = page->mapping->host;
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	if (PageDirty(page))
+		dec_page_count(sbi, F2FS_DIRTY_NODES);
+	ClearPagePrivate(page);
+}
+
+static int f2fs_release_node_page(struct page *page, gfp_t wait)
+{
+	ClearPagePrivate(page);
+	return 0;
+}
+
+/**
+ * Structure of the f2fs node operations
+ */
+const struct address_space_operations f2fs_node_aops
= { + .writepage = f2fs_write_node_page, + .writepages = f2fs_write_node_pages, + .set_page_dirty = f2fs_set_node_page_dirty, + .invalidatepage = f2fs_invalidate_node_page, + .releasepage = f2fs_release_node_page, +}; + +static struct free_nid *__lookup_free_nid_list(nid_t n, struct list_head *head) +{ + struct list_head *this; + struct free_nid *i = NULL; + list_for_each(this, head) { + i = list_entry(this, struct free_nid, list); + if (i->nid == n) + break; + i = NULL; + } + return i; +} + +static void __del_from_free_nid_list(struct free_nid *i) +{ + list_del(&i->list); + kmem_cache_free(free_nid_slab, i); +} + +static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) +{ + struct free_nid *i; + + if (nm_i->fcnt > 2 * MAX_FREE_NIDS) + return 0; +retry: + i = kmem_cache_alloc(free_nid_slab, GFP_NOFS); + if (!i) { + cond_resched(); + goto retry; + } + i->nid = nid; + i->state = NID_NEW; + + spin_lock(&nm_i->free_nid_list_lock); + if (__lookup_free_nid_list(nid, &nm_i->free_nid_list)) { + spin_unlock(&nm_i->free_nid_list_lock); + kmem_cache_free(free_nid_slab, i); + return 0; + } + list_add_tail(&i->list, &nm_i->free_nid_list); + nm_i->fcnt++; + spin_unlock(&nm_i->free_nid_list_lock); + return 1; +} + +static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) +{ + struct free_nid *i; + spin_lock(&nm_i->free_nid_list_lock); + i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); + if (i && i->state == NID_NEW) { + __del_from_free_nid_list(i); + nm_i->fcnt--; + } + spin_unlock(&nm_i->free_nid_list_lock); +} + +static int scan_nat_page(struct f2fs_nm_info *nm_i, + struct page *nat_page, nid_t start_nid) +{ + struct f2fs_nat_block *nat_blk = page_address(nat_page); + block_t blk_addr; + int fcnt = 0; + int i; + + /* 0 nid should not be used */ + if (start_nid == 0) + ++start_nid; + + i = start_nid % NAT_ENTRY_PER_BLOCK; + + for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) { + blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); + BUG_ON(blk_addr == NEW_ADDR); + if (blk_addr == NULL_ADDR) + fcnt += add_free_nid(nm_i, start_nid); + } + return fcnt; +} + +static void build_free_nids(struct f2fs_sb_info *sbi) +{ + struct free_nid *fnid, *next_fnid; + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_summary_block *sum = curseg->sum_blk; + nid_t nid = 0; + bool is_cycled = false; + int fcnt = 0; + int i; + + nid = nm_i->next_scan_nid; + nm_i->init_scan_nid = nid; + + ra_nat_pages(sbi, nid); + + while (1) { + struct page *page = get_current_nat_page(sbi, nid); + + fcnt += scan_nat_page(nm_i, page, nid); + f2fs_put_page(page, 1); + + nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK)); + + if (nid >= nm_i->max_nid) { + nid = 0; + is_cycled = true; + } + if (fcnt > MAX_FREE_NIDS) + break; + if (is_cycled && nm_i->init_scan_nid <= nid) + break; + } + + nm_i->next_scan_nid = nid; + + /* find free nids from current sum_pages */ + mutex_lock(&curseg->curseg_mutex); + for (i = 0; i < nats_in_cursum(sum); i++) { + block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr); + nid = le32_to_cpu(nid_in_journal(sum, i)); + if (addr == NULL_ADDR) + add_free_nid(nm_i, nid); + else + remove_free_nid(nm_i, nid); + } + mutex_unlock(&curseg->curseg_mutex); + + /* remove the free nids from current allocated nids */ + list_for_each_entry_safe(fnid, next_fnid, &nm_i->free_nid_list, list) { + struct nat_entry *ne; + + read_lock(&nm_i->nat_tree_lock); + ne = __lookup_nat_cache(nm_i, fnid->nid); + if (ne && nat_get_blkaddr(ne) != 
NULL_ADDR)
+			remove_free_nid(nm_i, fnid->nid);
+		read_unlock(&nm_i->nat_tree_lock);
+	}
+}
+
+/*
+ * If this function returns success, the caller can obtain a new nid
+ * from the second parameter of this function.
+ * The returned nid can be used as an ino as well as a nid when an inode
+ * is created.
+ */
+bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct free_nid *i = NULL;
+	struct list_head *this;
+retry:
+	mutex_lock(&nm_i->build_lock);
+	if (!nm_i->fcnt) {
+		/* scan NAT in order to build free nid list */
+		build_free_nids(sbi);
+		if (!nm_i->fcnt) {
+			mutex_unlock(&nm_i->build_lock);
+			return false;
+		}
+	}
+	mutex_unlock(&nm_i->build_lock);
+
+	/*
+	 * We check fcnt again, since the previous check is racy as
+	 * we didn't hold free_nid_list_lock. So another thread
+	 * could have consumed all of the free nids.
+	 */
+	spin_lock(&nm_i->free_nid_list_lock);
+	if (!nm_i->fcnt) {
+		spin_unlock(&nm_i->free_nid_list_lock);
+		goto retry;
+	}
+
+	BUG_ON(list_empty(&nm_i->free_nid_list));
+	list_for_each(this, &nm_i->free_nid_list) {
+		i = list_entry(this, struct free_nid, list);
+		if (i->state == NID_NEW)
+			break;
+	}
+
+	BUG_ON(i->state != NID_NEW);
+	*nid = i->nid;
+	i->state = NID_ALLOC;
+	nm_i->fcnt--;
+	spin_unlock(&nm_i->free_nid_list_lock);
+	return true;
+}
+
+/**
+ * alloc_nid() should be called prior to this function.
+ */
+void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct free_nid *i;
+
+	spin_lock(&nm_i->free_nid_list_lock);
+	i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
+	if (i) {
+		BUG_ON(i->state != NID_ALLOC);
+		__del_from_free_nid_list(i);
+	}
+	spin_unlock(&nm_i->free_nid_list_lock);
+}
+
+/**
+ * alloc_nid() should be called prior to this function.
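+ *
+ * alloc_nid_failed() below both releases the NID_ALLOC entry and puts the
+ * nid back on the free list, e.g. when new_node_page() fails right after a
+ * successful alloc_nid() in get_dnode_of_data().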
+ */
+void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
+{
+	alloc_nid_done(sbi, nid);
+	add_free_nid(NM_I(sbi), nid);
+}
+
+void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
+		struct f2fs_summary *sum, struct node_info *ni,
+		block_t new_blkaddr)
+{
+	rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr);
+	set_node_addr(sbi, ni, new_blkaddr);
+	clear_node_page_dirty(page);
+}
+
+int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
+{
+	struct address_space *mapping = sbi->node_inode->i_mapping;
+	struct f2fs_node *src, *dst;
+	nid_t ino = ino_of_node(page);
+	struct node_info old_ni, new_ni;
+	struct page *ipage;
+
+	ipage = grab_cache_page(mapping, ino);
+	if (!ipage)
+		return -ENOMEM;
+
+	/* Should not use this inode from free nid list */
+	remove_free_nid(NM_I(sbi), ino);
+
+	get_node_info(sbi, ino, &old_ni);
+	SetPageUptodate(ipage);
+	fill_node_footer(ipage, ino, ino, 0, true);
+
+	src = (struct f2fs_node *)page_address(page);
+	dst = (struct f2fs_node *)page_address(ipage);
+
+	memcpy(dst, src, (unsigned long)&src->i.i_ext - (unsigned long)&src->i);
+	dst->i.i_size = 0;
+	dst->i.i_blocks = 1;
+	dst->i.i_links = 1;
+	dst->i.i_xattr_nid = 0;
+
+	new_ni = old_ni;
+	new_ni.ino = ino;
+
+	set_node_addr(sbi, &new_ni, NEW_ADDR);
+	inc_valid_inode_count(sbi);
+
+	f2fs_put_page(ipage, 1);
+	return 0;
+}
+
+int restore_node_summary(struct f2fs_sb_info *sbi,
+			unsigned int segno, struct f2fs_summary_block *sum)
+{
+	struct f2fs_node *rn;
+	struct f2fs_summary *sum_entry;
+	struct page *page;
+	block_t addr;
+	int i, last_offset;
+
+	/* allocate a temporary page to read the node blocks */
+	page = alloc_page(GFP_NOFS | __GFP_ZERO);
+	if (!page)
+		return -ENOMEM;
+	lock_page(page);
+
+	/* scan the node segment */
+	last_offset = sbi->blocks_per_seg;
+	addr = START_BLOCK(sbi, segno);
+	sum_entry = &sum->entries[0];
+
+	for (i = 0; i < last_offset; i++, sum_entry++) {
+		if (f2fs_readpage(sbi, page, addr, READ_SYNC))
+			goto out;
+
+		rn = (struct f2fs_node *)page_address(page);
+		sum_entry->nid = rn->footer.nid;
+		sum_entry->version = 0;
+		sum_entry->ofs_in_node = 0;
+		addr++;
+
+		/*
+		 * In order to read the next node page,
+		 * we must clear the PageUptodate flag.
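+		 * (f2fs_readpage() is assumed to skip pages that are
+		 * already uptodate, so leaving the flag set would make
+		 * every later iteration see the previous block's contents.)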
+		 */
+		ClearPageUptodate(page);
+	}
+out:
+	unlock_page(page);
+	__free_pages(page, 0);
+	return 0;
+}
+
+static bool flush_nats_in_journal(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+	struct f2fs_summary_block *sum = curseg->sum_blk;
+	int i;
+
+	mutex_lock(&curseg->curseg_mutex);
+
+	if (nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) {
+		mutex_unlock(&curseg->curseg_mutex);
+		return false;
+	}
+
+	for (i = 0; i < nats_in_cursum(sum); i++) {
+		struct nat_entry *ne;
+		struct f2fs_nat_entry raw_ne;
+		nid_t nid = le32_to_cpu(nid_in_journal(sum, i));
+
+		raw_ne = nat_in_journal(sum, i);
+retry:
+		write_lock(&nm_i->nat_tree_lock);
+		ne = __lookup_nat_cache(nm_i, nid);
+		if (ne) {
+			__set_nat_cache_dirty(nm_i, ne);
+			write_unlock(&nm_i->nat_tree_lock);
+			continue;
+		}
+		ne = grab_nat_entry(nm_i, nid);
+		if (!ne) {
+			write_unlock(&nm_i->nat_tree_lock);
+			goto retry;
+		}
+		nat_set_blkaddr(ne, le32_to_cpu(raw_ne.block_addr));
+		nat_set_ino(ne, le32_to_cpu(raw_ne.ino));
+		nat_set_version(ne, raw_ne.version);
+		__set_nat_cache_dirty(nm_i, ne);
+		write_unlock(&nm_i->nat_tree_lock);
+	}
+	update_nats_in_cursum(sum, -i);
+	mutex_unlock(&curseg->curseg_mutex);
+	return true;
+}
+
+/**
+ * This function is called during the checkpointing process.
+ */
+void flush_nat_entries(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+	struct f2fs_summary_block *sum = curseg->sum_blk;
+	struct list_head *cur, *n;
+	struct page *page = NULL;
+	struct f2fs_nat_block *nat_blk = NULL;
+	nid_t start_nid = 0, end_nid = 0;
+	bool flushed;
+
+	flushed = flush_nats_in_journal(sbi);
+
+	if (!flushed)
+		mutex_lock(&curseg->curseg_mutex);
+
+	/* 1) flush dirty nat caches */
+	list_for_each_safe(cur, n, &nm_i->dirty_nat_entries) {
+		struct nat_entry *ne;
+		nid_t nid;
+		struct f2fs_nat_entry raw_ne;
+		int offset = -1;
+		block_t old_blkaddr, new_blkaddr;
+
+		ne = list_entry(cur, struct nat_entry, list);
+		nid = nat_get_nid(ne);
+
+		if (nat_get_blkaddr(ne) == NEW_ADDR)
+			continue;
+		if (flushed)
+			goto to_nat_page;
+
+		/* if there is room for nat entries in the curseg summary page */
+		offset = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 1);
+		if (offset >= 0) {
+			raw_ne = nat_in_journal(sum, offset);
+			old_blkaddr = le32_to_cpu(raw_ne.block_addr);
+			goto flush_now;
+		}
+to_nat_page:
+		if (!page || (start_nid > nid || nid > end_nid)) {
+			if (page) {
+				f2fs_put_page(page, 1);
+				page = NULL;
+			}
+			start_nid = START_NID(nid);
+			end_nid = start_nid + NAT_ENTRY_PER_BLOCK - 1;
+
+			/*
+			 * get the nat block page: dirty, with an elevated
+			 * reference count, mapped and locked
+			 */
+			page = get_next_nat_page(sbi, start_nid);
+			nat_blk = page_address(page);
+		}
+
+		BUG_ON(!nat_blk);
+		raw_ne = nat_blk->entries[nid - start_nid];
+		old_blkaddr = le32_to_cpu(raw_ne.block_addr);
+flush_now:
+		new_blkaddr = nat_get_blkaddr(ne);
+
+		raw_ne.ino = cpu_to_le32(nat_get_ino(ne));
+		raw_ne.block_addr = cpu_to_le32(new_blkaddr);
+		raw_ne.version = nat_get_version(ne);
+
+		if (offset < 0) {
+			nat_blk->entries[nid - start_nid] = raw_ne;
+		} else {
+			nat_in_journal(sum, offset) = raw_ne;
+			nid_in_journal(sum, offset) = cpu_to_le32(nid);
+		}
+
+		if (nat_get_blkaddr(ne) == NULL_ADDR) {
+			write_lock(&nm_i->nat_tree_lock);
+			__del_from_nat_cache(nm_i, ne);
+			write_unlock(&nm_i->nat_tree_lock);
+
+			/* We can reuse this freed nid at this point */
+			add_free_nid(NM_I(sbi), nid);
+		} else {
+
write_lock(&nm_i->nat_tree_lock); + __clear_nat_cache_dirty(nm_i, ne); + ne->checkpointed = true; + write_unlock(&nm_i->nat_tree_lock); + } + } + if (!flushed) + mutex_unlock(&curseg->curseg_mutex); + f2fs_put_page(page, 1); + + /* 2) shrink nat caches if necessary */ + try_to_free_nats(sbi, nm_i->nat_cnt - NM_WOUT_THRESHOLD); +} + +static int init_node_manager(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned char *version_bitmap; + unsigned int nat_segs, nat_blocks; + + nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr); + + /* segment_count_nat includes pair segment so divide to 2. */ + nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1; + nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); + nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; + nm_i->fcnt = 0; + nm_i->nat_cnt = 0; + + INIT_LIST_HEAD(&nm_i->free_nid_list); + INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC); + INIT_LIST_HEAD(&nm_i->nat_entries); + INIT_LIST_HEAD(&nm_i->dirty_nat_entries); + + mutex_init(&nm_i->build_lock); + spin_lock_init(&nm_i->free_nid_list_lock); + rwlock_init(&nm_i->nat_tree_lock); + + nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP); + nm_i->init_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); + nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); + + nm_i->nat_bitmap = kzalloc(nm_i->bitmap_size, GFP_KERNEL); + if (!nm_i->nat_bitmap) + return -ENOMEM; + version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP); + if (!version_bitmap) + return -EFAULT; + + /* copy version bitmap */ + memcpy(nm_i->nat_bitmap, version_bitmap, nm_i->bitmap_size); + return 0; +} + +int build_node_manager(struct f2fs_sb_info *sbi) +{ + int err; + + sbi->nm_info = kzalloc(sizeof(struct f2fs_nm_info), GFP_KERNEL); + if (!sbi->nm_info) + return -ENOMEM; + + err = init_node_manager(sbi); + if (err) + return err; + + build_free_nids(sbi); + return 0; +} + +void destroy_node_manager(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *i, *next_i; + struct nat_entry *natvec[NATVEC_SIZE]; + nid_t nid = 0; + unsigned int found; + + if (!nm_i) + return; + + /* destroy free nid list */ + spin_lock(&nm_i->free_nid_list_lock); + list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { + BUG_ON(i->state == NID_ALLOC); + __del_from_free_nid_list(i); + nm_i->fcnt--; + } + BUG_ON(nm_i->fcnt); + spin_unlock(&nm_i->free_nid_list_lock); + + /* destroy nat cache */ + write_lock(&nm_i->nat_tree_lock); + while ((found = __gang_lookup_nat_cache(nm_i, + nid, NATVEC_SIZE, natvec))) { + unsigned idx; + for (idx = 0; idx < found; idx++) { + struct nat_entry *e = natvec[idx]; + nid = nat_get_nid(e) + 1; + __del_from_nat_cache(nm_i, e); + } + } + BUG_ON(nm_i->nat_cnt); + write_unlock(&nm_i->nat_tree_lock); + + kfree(nm_i->nat_bitmap); + sbi->nm_info = NULL; + kfree(nm_i); +} + +int create_node_manager_caches(void) +{ + nat_entry_slab = f2fs_kmem_cache_create("nat_entry", + sizeof(struct nat_entry), NULL); + if (!nat_entry_slab) + return -ENOMEM; + + free_nid_slab = f2fs_kmem_cache_create("free_nid", + sizeof(struct free_nid), NULL); + if (!free_nid_slab) { + kmem_cache_destroy(nat_entry_slab); + return -ENOMEM; + } + return 0; +} + +void destroy_node_manager_caches(void) +{ + kmem_cache_destroy(free_nid_slab); + kmem_cache_destroy(nat_entry_slab); +} -- cgit v1.2.1 From 351df4b201157351c7d26bf12c3eeb9dbce98854 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 2 Nov 2012 17:09:16 +0900 Subject: f2fs: 
add segment operations

This adds specific functions not only to manage dirty/free segments, SIT pages, a cache for SIT entries, and summary entries, but also to allocate free blocks and write three types of pages: data, node, and meta.

- F2FS maintains three types of bitmaps in memory, which indicate free, prefree, and dirty segments respectively.

- The key information of an SIT entry consists of a segment number, the number of valid blocks in the segment, and a bitmap identifying the valid and invalid blocks therein.

- An SIT page is composed of a certain range of SIT entries, and is maintained by the address space of meta_inode.

- To cache SIT entries, a simple array is used. The index for the array is the segment number.

- A summary entry for data contains the parent node information. A summary entry for a node contains its node offset from the inode.

- F2FS manages information about six active logs and their summary entries in memory. Whenever one of them is changed, its summary entries are flushed to its SIT page maintained by the address space of meta_inode.

- This patch adds a default block allocation function which supports a heap-based allocation policy.

- This patch adds core functions to write data, node, and meta pages. Since LFS basically produces a series of sequential writes, F2FS merges sequential bios into a single one as much as possible to reduce the IO scheduling overhead.

Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 1798 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1798 insertions(+) create mode 100644 fs/f2fs/segment.c (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c new file mode 100644 index 000000000000..ed7c079cfc7f --- /dev/null +++ b/fs/f2fs/segment.c @@ -0,0 +1,1798 @@
+/**
+ * fs/f2fs/segment.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/prefetch.h>
+
+#include "f2fs.h"
+#include "segment.h"
+#include "node.h"
+
+static int need_to_flush(struct f2fs_sb_info *sbi)
+{
+	unsigned int pages_per_sec = (1 << sbi->log_blocks_per_seg) *
+			sbi->segs_per_sec;
+	int node_secs = ((get_pages(sbi, F2FS_DIRTY_NODES) + pages_per_sec - 1)
+		>> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
+	int dent_secs = ((get_pages(sbi, F2FS_DIRTY_DENTS) + pages_per_sec - 1)
+		>> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
+
+	if (sbi->por_doing)
+		return 0;
+
+	if (free_sections(sbi) <= (node_secs + 2 * dent_secs +
+			reserved_sections(sbi)))
+		return 1;
+	return 0;
+}
+
+/**
+ * This function balances dirty node and dentry pages.
+ * In addition, it controls garbage collection.
+ */
+void f2fs_balance_fs(struct f2fs_sb_info *sbi)
+{
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL,
+		.nr_to_write = LONG_MAX,
+		.for_reclaim = 0,
+	};
+
+	if (sbi->por_doing)
+		return;
+
+	/*
+	 * We should do a checkpoint when there are too many dirty node pages
+	 * even with enough free segments. After that, we should do GC.
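+	 *
+	 * (A rough sketch of need_to_flush() above, assuming 512 blocks per
+	 * segment and one segment per section: 1536 dirty node pages amount
+	 * to three sections, so once free sections drop to
+	 * node_secs + 2 * dent_secs + reserved_sections(), we flush first.)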
+	 */
+	if (need_to_flush(sbi)) {
+		sync_dirty_dir_inodes(sbi);
+		sync_node_pages(sbi, 0, &wbc);
+	}
+
+	if (has_not_enough_free_secs(sbi)) {
+		mutex_lock(&sbi->gc_mutex);
+		f2fs_gc(sbi, 1);
+	}
+}
+
+static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
+					enum dirty_type dirty_type)
+{
+	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+
+	/* need not be added */
+	if (IS_CURSEG(sbi, segno))
+		return;
+
+	if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type]))
+		dirty_i->nr_dirty[dirty_type]++;
+
+	if (dirty_type == DIRTY) {
+		struct seg_entry *sentry = get_seg_entry(sbi, segno);
+		dirty_type = sentry->type;
+		if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type]))
+			dirty_i->nr_dirty[dirty_type]++;
+	}
+}
+
+static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
+					enum dirty_type dirty_type)
+{
+	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+
+	if (test_and_clear_bit(segno, dirty_i->dirty_segmap[dirty_type]))
+		dirty_i->nr_dirty[dirty_type]--;
+
+	if (dirty_type == DIRTY) {
+		struct seg_entry *sentry = get_seg_entry(sbi, segno);
+		dirty_type = sentry->type;
+		if (test_and_clear_bit(segno,
+				dirty_i->dirty_segmap[dirty_type]))
+			dirty_i->nr_dirty[dirty_type]--;
+		clear_bit(segno, dirty_i->victim_segmap[FG_GC]);
+		clear_bit(segno, dirty_i->victim_segmap[BG_GC]);
+	}
+}
+
+/**
+ * Errors such as -ENOMEM should not occur here.
+ * Adding a dirty entry into the seglist is not a critical operation.
+ * If a given segment is one of the current working segments, it won't be added.
+ */
+void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+	unsigned short valid_blocks;
+
+	if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno))
+		return;
+
+	mutex_lock(&dirty_i->seglist_lock);
+
+	valid_blocks = get_valid_blocks(sbi, segno, 0);
+
+	if (valid_blocks == 0) {
+		__locate_dirty_segment(sbi, segno, PRE);
+		__remove_dirty_segment(sbi, segno, DIRTY);
+	} else if (valid_blocks < sbi->blocks_per_seg) {
+		__locate_dirty_segment(sbi, segno, DIRTY);
+	} else {
+		/* Recovery routine with SSR needs this */
+		__remove_dirty_segment(sbi, segno, DIRTY);
+	}
+
+	mutex_unlock(&dirty_i->seglist_lock);
+	return;
+}
+
+/**
+ * clear_prefree_segments should be called after the checkpoint is done.
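+ *
+ * (set_prefree_as_free_segments() below only flips the in-memory free
+ * segmap; the on-disk SIT still counts those blocks as valid until the
+ * checkpoint commits, which is why prefree segments must not be reused
+ * earlier.)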
+ */ +static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int segno, offset = 0; + unsigned int total_segs = TOTAL_SEGS(sbi); + + mutex_lock(&dirty_i->seglist_lock); + while (1) { + segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs, + offset); + if (segno >= total_segs) + break; + __set_test_and_free(sbi, segno); + offset = segno + 1; + } + mutex_unlock(&dirty_i->seglist_lock); +} + +void clear_prefree_segments(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int segno, offset = 0; + unsigned int total_segs = TOTAL_SEGS(sbi); + + mutex_lock(&dirty_i->seglist_lock); + while (1) { + segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs, + offset); + if (segno >= total_segs) + break; + + offset = segno + 1; + if (test_and_clear_bit(segno, dirty_i->dirty_segmap[PRE])) + dirty_i->nr_dirty[PRE]--; + + /* Let's use trim */ + if (test_opt(sbi, DISCARD)) + blkdev_issue_discard(sbi->sb->s_bdev, + START_BLOCK(sbi, segno) << + sbi->log_sectors_per_block, + 1 << (sbi->log_sectors_per_block + + sbi->log_blocks_per_seg), + GFP_NOFS, 0); + } + mutex_unlock(&dirty_i->seglist_lock); +} + +static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) +{ + struct sit_info *sit_i = SIT_I(sbi); + if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap)) + sit_i->dirty_sentries++; +} + +static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type, + unsigned int segno, int modified) +{ + struct seg_entry *se = get_seg_entry(sbi, segno); + se->type = type; + if (modified) + __mark_sit_entry_dirty(sbi, segno); +} + +static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) +{ + struct seg_entry *se; + unsigned int segno, offset; + long int new_vblocks; + + segno = GET_SEGNO(sbi, blkaddr); + + se = get_seg_entry(sbi, segno); + new_vblocks = se->valid_blocks + del; + offset = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & (sbi->blocks_per_seg - 1); + + BUG_ON((new_vblocks >> (sizeof(unsigned short) << 3) || + (new_vblocks > sbi->blocks_per_seg))); + + se->valid_blocks = new_vblocks; + se->mtime = get_mtime(sbi); + SIT_I(sbi)->max_mtime = se->mtime; + + /* Update valid block bitmap */ + if (del > 0) { + if (f2fs_set_bit(offset, se->cur_valid_map)) + BUG(); + } else { + if (!f2fs_clear_bit(offset, se->cur_valid_map)) + BUG(); + } + if (!f2fs_test_bit(offset, se->ckpt_valid_map)) + se->ckpt_valid_blocks += del; + + __mark_sit_entry_dirty(sbi, segno); + + /* update total number of valid blocks to be written in ckpt area */ + SIT_I(sbi)->written_valid_blocks += del; + + if (sbi->segs_per_sec > 1) + get_sec_entry(sbi, segno)->valid_blocks += del; +} + +static void refresh_sit_entry(struct f2fs_sb_info *sbi, + block_t old_blkaddr, block_t new_blkaddr) +{ + update_sit_entry(sbi, new_blkaddr, 1); + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) + update_sit_entry(sbi, old_blkaddr, -1); +} + +void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) +{ + unsigned int segno = GET_SEGNO(sbi, addr); + struct sit_info *sit_i = SIT_I(sbi); + + BUG_ON(addr == NULL_ADDR); + if (addr == NEW_ADDR) + return; + + /* add it into sit main buffer */ + mutex_lock(&sit_i->sentry_lock); + + update_sit_entry(sbi, addr, -1); + + /* add it into dirty seglist */ + locate_dirty_segment(sbi, segno); + + mutex_unlock(&sit_i->sentry_lock); +} + +/** + * This function should be resided under the curseg_mutex lock + */ +static void __add_sum_entry(struct 
f2fs_sb_info *sbi, int type,
+			struct f2fs_summary *sum, unsigned short offset)
+{
+	struct curseg_info *curseg = CURSEG_I(sbi, type);
+	void *addr = curseg->sum_blk;
+	addr += offset * sizeof(struct f2fs_summary);
+	memcpy(addr, sum, sizeof(struct f2fs_summary));
+	return;
+}
+
+/**
+ * Calculate the number of current summary pages for writing
+ */
+int npages_for_summary_flush(struct f2fs_sb_info *sbi)
+{
+	int total_size_bytes = 0;
+	int valid_sum_count = 0;
+	int i, sum_space;
+
+	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
+		if (sbi->ckpt->alloc_type[i] == SSR)
+			valid_sum_count += sbi->blocks_per_seg;
+		else
+			valid_sum_count += curseg_blkoff(sbi, i);
+	}
+
+	total_size_bytes = valid_sum_count * (SUMMARY_SIZE + 1)
+			+ sizeof(struct nat_journal) + 2
+			+ sizeof(struct sit_journal) + 2;
+	sum_space = PAGE_CACHE_SIZE - SUM_FOOTER_SIZE;
+	if (total_size_bytes < sum_space)
+		return 1;
+	else if (total_size_bytes < 2 * sum_space)
+		return 2;
+	return 3;
+}
+
+/**
+ * Caller should put this summary page
+ */
+struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+	return get_meta_page(sbi, GET_SUM_BLOCK(sbi, segno));
+}
+
+static void write_sum_page(struct f2fs_sb_info *sbi,
+			struct f2fs_summary_block *sum_blk, block_t blk_addr)
+{
+	struct page *page = grab_meta_page(sbi, blk_addr);
+	void *kaddr = page_address(page);
+	memcpy(kaddr, sum_blk, PAGE_CACHE_SIZE);
+	set_page_dirty(page);
+	f2fs_put_page(page, 1);
+}
+
+static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi,
+					int ofs_unit, int type)
+{
+	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+	unsigned long *prefree_segmap = dirty_i->dirty_segmap[PRE];
+	unsigned int segno, next_segno, i;
+	int ofs = 0;
+
+	/*
+	 * If there are not enough reserved sections,
+	 * we should not reuse prefree segments.
+	 */
+	if (has_not_enough_free_secs(sbi))
+		return NULL_SEGNO;
+
+	/*
+	 * A NODE page should not reuse a prefree segment,
+	 * since that information is used for SPOR.
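+	 *
+	 * (SPOR: sudden power-off recovery. Node blocks written after the
+	 * last checkpoint form the chain that roll-forward recovery follows,
+	 * so their old segments must stay intact until the next checkpoint
+	 * completes.)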
+	 */
+	if (IS_NODESEG(type))
+		return NULL_SEGNO;
+next:
+	segno = find_next_bit(prefree_segmap, TOTAL_SEGS(sbi), ofs++);
+	ofs = ((segno / ofs_unit) * ofs_unit) + ofs_unit;
+	if (segno < TOTAL_SEGS(sbi)) {
+		/* skip intermediate segments in a section */
+		if (segno % ofs_unit)
+			goto next;
+
+		/* skip if whole section is not prefree */
+		next_segno = find_next_zero_bit(prefree_segmap,
+				TOTAL_SEGS(sbi), segno + 1);
+		if (next_segno - segno < ofs_unit)
+			goto next;
+
+		/* skip if whole section was not free at the last checkpoint */
+		for (i = 0; i < ofs_unit; i++)
+			if (get_seg_entry(sbi, segno)->ckpt_valid_blocks)
+				goto next;
+		return segno;
+	}
+	return NULL_SEGNO;
+}
+
+/**
+ * Find a new segment from the free segmap, searching in the given direction.
+ * This function must succeed; otherwise, BUG.
+ */
+static void get_new_segment(struct f2fs_sb_info *sbi,
+			unsigned int *newseg, bool new_sec, int dir)
+{
+	struct free_segmap_info *free_i = FREE_I(sbi);
+	unsigned int total_secs = sbi->total_sections;
+	unsigned int segno, secno, zoneno;
+	unsigned int total_zones = sbi->total_sections / sbi->secs_per_zone;
+	unsigned int hint = *newseg / sbi->segs_per_sec;
+	unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg);
+	unsigned int left_start = hint;
+	bool init = true;
+	int go_left = 0;
+	int i;
+
+	write_lock(&free_i->segmap_lock);
+
+	if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
+		segno = find_next_zero_bit(free_i->free_segmap,
+					TOTAL_SEGS(sbi), *newseg + 1);
+		if (segno < TOTAL_SEGS(sbi))
+			goto got_it;
+	}
+find_other_zone:
+	secno = find_next_zero_bit(free_i->free_secmap, total_secs, hint);
+	if (secno >= total_secs) {
+		if (dir == ALLOC_RIGHT) {
+			secno = find_next_zero_bit(free_i->free_secmap,
+						total_secs, 0);
+			BUG_ON(secno >= total_secs);
+		} else {
+			go_left = 1;
+			left_start = hint - 1;
+		}
+	}
+	if (go_left == 0)
+		goto skip_left;
+
+	while (test_bit(left_start, free_i->free_secmap)) {
+		if (left_start > 0) {
+			left_start--;
+			continue;
+		}
+		left_start = find_next_zero_bit(free_i->free_secmap,
+						total_secs, 0);
+		BUG_ON(left_start >= total_secs);
+		break;
+	}
+	secno = left_start;
+skip_left:
+	hint = secno;
+	segno = secno * sbi->segs_per_sec;
+	zoneno = secno / sbi->secs_per_zone;
+
+	/* give up on finding another zone */
+	if (!init)
+		goto got_it;
+	if (sbi->secs_per_zone == 1)
+		goto got_it;
+	if (zoneno == old_zoneno)
+		goto got_it;
+	if (dir == ALLOC_LEFT) {
+		if (!go_left && zoneno + 1 >= total_zones)
+			goto got_it;
+		if (go_left && zoneno == 0)
+			goto got_it;
+	}
+	for (i = 0; i < NR_CURSEG_TYPE; i++)
+		if (CURSEG_I(sbi, i)->zone == zoneno)
+			break;
+
+	if (i < NR_CURSEG_TYPE) {
+		/* zone is in use, try another */
+		if (go_left)
+			hint = zoneno * sbi->secs_per_zone - 1;
+		else if (zoneno + 1 >= total_zones)
+			hint = 0;
+		else
+			hint = (zoneno + 1) * sbi->secs_per_zone;
+		init = false;
+		goto find_other_zone;
+	}
+got_it:
+	/* set it as dirty segment in free segmap */
+	BUG_ON(test_bit(segno, free_i->free_segmap));
+	__set_inuse(sbi, segno);
+	*newseg = segno;
+	write_unlock(&free_i->segmap_lock);
+}
+
+static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
+{
+	struct curseg_info *curseg = CURSEG_I(sbi, type);
+	struct summary_footer *sum_footer;
+
+	curseg->segno = curseg->next_segno;
+	curseg->zone = GET_ZONENO_FROM_SEGNO(sbi, curseg->segno);
+	curseg->next_blkoff = 0;
+	curseg->next_segno = NULL_SEGNO;
+
+	sum_footer = &(curseg->sum_blk->footer);
+	memset(sum_footer, 0, sizeof(struct summary_footer));
+	if
(IS_DATASEG(type))
+		SET_SUM_TYPE(sum_footer, SUM_TYPE_DATA);
+	if (IS_NODESEG(type))
+		SET_SUM_TYPE(sum_footer, SUM_TYPE_NODE);
+	__set_sit_entry_type(sbi, type, curseg->segno, modified);
+}
+
+/**
+ * Allocate a current working segment.
+ * This function always allocates a free segment in LFS manner.
+ */
+static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
+{
+	struct curseg_info *curseg = CURSEG_I(sbi, type);
+	unsigned int segno = curseg->segno;
+	int dir = ALLOC_LEFT;
+
+	write_sum_page(sbi, curseg->sum_blk,
+				GET_SUM_BLOCK(sbi, curseg->segno));
+	if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA)
+		dir = ALLOC_RIGHT;
+
+	if (test_opt(sbi, NOHEAP))
+		dir = ALLOC_RIGHT;
+
+	get_new_segment(sbi, &segno, new_sec, dir);
+	curseg->next_segno = segno;
+	reset_curseg(sbi, type, 1);
+	curseg->alloc_type = LFS;
+}
+
+static void __next_free_blkoff(struct f2fs_sb_info *sbi,
+			struct curseg_info *seg, block_t start)
+{
+	struct seg_entry *se = get_seg_entry(sbi, seg->segno);
+	block_t ofs;
+	for (ofs = start; ofs < sbi->blocks_per_seg; ofs++) {
+		if (!f2fs_test_bit(ofs, se->ckpt_valid_map)
+			&& !f2fs_test_bit(ofs, se->cur_valid_map))
+			break;
+	}
+	seg->next_blkoff = ofs;
+}
+
+/**
+ * If a segment is written in LFS manner, the next block offset is just
+ * obtained by increasing the current block offset. However, if a segment is
+ * written in SSR manner, the next block offset is obtained by calling
+ * __next_free_blkoff.
+ */
+static void __refresh_next_blkoff(struct f2fs_sb_info *sbi,
+			struct curseg_info *seg)
+{
+	if (seg->alloc_type == SSR)
+		__next_free_blkoff(sbi, seg, seg->next_blkoff + 1);
+	else
+		seg->next_blkoff++;
+}
+
+/**
+ * This function always allocates a used segment (from the dirty seglist) in
+ * SSR manner, so it recovers the existing segment information of the valid
+ * blocks.
+ */
+static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse)
+{
+	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+	struct curseg_info *curseg = CURSEG_I(sbi, type);
+	unsigned int new_segno = curseg->next_segno;
+	struct f2fs_summary_block *sum_node;
+	struct page *sum_page;
+
+	write_sum_page(sbi, curseg->sum_blk,
+				GET_SUM_BLOCK(sbi, curseg->segno));
+	__set_test_and_inuse(sbi, new_segno);
+
+	mutex_lock(&dirty_i->seglist_lock);
+	__remove_dirty_segment(sbi, new_segno, PRE);
+	__remove_dirty_segment(sbi, new_segno, DIRTY);
+	mutex_unlock(&dirty_i->seglist_lock);
+
+	reset_curseg(sbi, type, 1);
+	curseg->alloc_type = SSR;
+	__next_free_blkoff(sbi, curseg, 0);
+
+	if (reuse) {
+		sum_page = get_sum_page(sbi, new_segno);
+		sum_node = (struct f2fs_summary_block *)page_address(sum_page);
+		memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE);
+		f2fs_put_page(sum_page, 1);
+	}
+}
+
+/*
+ * Flush out the current segment and replace it with a new one.
+ * This function must succeed; otherwise, BUG.
+ */
+static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
+						int type, bool force)
+{
+	struct curseg_info *curseg = CURSEG_I(sbi, type);
+	unsigned int ofs_unit;
+
+	if (force) {
+		new_curseg(sbi, type, true);
+		goto out;
+	}
+
+	ofs_unit = need_SSR(sbi) ?
1 : sbi->segs_per_sec; + curseg->next_segno = check_prefree_segments(sbi, ofs_unit, type); + + if (curseg->next_segno != NULL_SEGNO) + change_curseg(sbi, type, false); + else if (type == CURSEG_WARM_NODE) + new_curseg(sbi, type, false); + else if (need_SSR(sbi) && get_ssr_segment(sbi, type)) + change_curseg(sbi, type, true); + else + new_curseg(sbi, type, false); +out: + sbi->segment_count[curseg->alloc_type]++; +} + +void allocate_new_segments(struct f2fs_sb_info *sbi) +{ + struct curseg_info *curseg; + unsigned int old_curseg; + int i; + + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + curseg = CURSEG_I(sbi, i); + old_curseg = curseg->segno; + SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true); + locate_dirty_segment(sbi, old_curseg); + } +} + +static const struct segment_allocation default_salloc_ops = { + .allocate_segment = allocate_segment_by_default, +}; + +static void f2fs_end_io_write(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_private *p = bio->bi_private; + + do { + struct page *page = bvec->bv_page; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + if (!uptodate) { + SetPageError(page); + if (page->mapping) + set_bit(AS_EIO, &page->mapping->flags); + p->sbi->ckpt->ckpt_flags |= CP_ERROR_FLAG; + set_page_dirty(page); + } + end_page_writeback(page); + dec_page_count(p->sbi, F2FS_WRITEBACK); + } while (bvec >= bio->bi_io_vec); + + if (p->is_sync) + complete(p->wait); + kfree(p); + bio_put(bio); +} + +struct bio *f2fs_bio_alloc(struct block_device *bdev, sector_t first_sector, + int nr_vecs, gfp_t gfp_flags) +{ + struct bio *bio; +repeat: + /* allocate new bio */ + bio = bio_alloc(gfp_flags, nr_vecs); + + if (bio == NULL && (current->flags & PF_MEMALLOC)) { + while (!bio && (nr_vecs /= 2)) + bio = bio_alloc(gfp_flags, nr_vecs); + } + if (bio) { + bio->bi_bdev = bdev; + bio->bi_sector = first_sector; +retry: + bio->bi_private = kmalloc(sizeof(struct bio_private), + GFP_NOFS | __GFP_HIGH); + if (!bio->bi_private) { + cond_resched(); + goto retry; + } + } + if (bio == NULL) { + cond_resched(); + goto repeat; + } + return bio; +} + +static void do_submit_bio(struct f2fs_sb_info *sbi, + enum page_type type, bool sync) +{ + int rw = sync ? WRITE_SYNC : WRITE; + enum page_type btype = type > META ? 
META : type; + + if (type >= META_FLUSH) + rw = WRITE_FLUSH_FUA; + + if (sbi->bio[btype]) { + struct bio_private *p = sbi->bio[btype]->bi_private; + p->sbi = sbi; + sbi->bio[btype]->bi_end_io = f2fs_end_io_write; + if (type == META_FLUSH) { + DECLARE_COMPLETION_ONSTACK(wait); + p->is_sync = true; + p->wait = &wait; + submit_bio(rw, sbi->bio[btype]); + wait_for_completion(&wait); + } else { + p->is_sync = false; + submit_bio(rw, sbi->bio[btype]); + } + sbi->bio[btype] = NULL; + } +} + +void f2fs_submit_bio(struct f2fs_sb_info *sbi, enum page_type type, bool sync) +{ + down_write(&sbi->bio_sem); + do_submit_bio(sbi, type, sync); + up_write(&sbi->bio_sem); +} + +static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page, + block_t blk_addr, enum page_type type) +{ + struct block_device *bdev = sbi->sb->s_bdev; + + verify_block_addr(sbi, blk_addr); + + down_write(&sbi->bio_sem); + + inc_page_count(sbi, F2FS_WRITEBACK); + + if (sbi->bio[type] && sbi->last_block_in_bio[type] != blk_addr - 1) + do_submit_bio(sbi, type, false); +alloc_new: + if (sbi->bio[type] == NULL) + sbi->bio[type] = f2fs_bio_alloc(bdev, + blk_addr << (sbi->log_blocksize - 9), + bio_get_nr_vecs(bdev), GFP_NOFS | __GFP_HIGH); + + if (bio_add_page(sbi->bio[type], page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + do_submit_bio(sbi, type, false); + goto alloc_new; + } + + sbi->last_block_in_bio[type] = blk_addr; + + up_write(&sbi->bio_sem); +} + +static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + if (curseg->next_blkoff < sbi->blocks_per_seg) + return true; + return false; +} + +static int __get_segment_type_2(struct page *page, enum page_type p_type) +{ + if (p_type == DATA) + return CURSEG_HOT_DATA; + else + return CURSEG_HOT_NODE; +} + +static int __get_segment_type_4(struct page *page, enum page_type p_type) +{ + if (p_type == DATA) { + struct inode *inode = page->mapping->host; + + if (S_ISDIR(inode->i_mode)) + return CURSEG_HOT_DATA; + else + return CURSEG_COLD_DATA; + } else { + if (IS_DNODE(page) && !is_cold_node(page)) + return CURSEG_HOT_NODE; + else + return CURSEG_COLD_NODE; + } +} + +static int __get_segment_type_6(struct page *page, enum page_type p_type) +{ + if (p_type == DATA) { + struct inode *inode = page->mapping->host; + + if (S_ISDIR(inode->i_mode)) + return CURSEG_HOT_DATA; + else if (is_cold_data(page) || is_cold_file(inode)) + return CURSEG_COLD_DATA; + else + return CURSEG_WARM_DATA; + } else { + if (IS_DNODE(page)) + return is_cold_node(page) ? 
CURSEG_WARM_NODE : + CURSEG_HOT_NODE; + else + return CURSEG_COLD_NODE; + } +} + +static int __get_segment_type(struct page *page, enum page_type p_type) +{ + struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); + switch (sbi->active_logs) { + case 2: + return __get_segment_type_2(page, p_type); + case 4: + return __get_segment_type_4(page, p_type); + case 6: + return __get_segment_type_6(page, p_type); + default: + BUG(); + } +} + +static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, + block_t old_blkaddr, block_t *new_blkaddr, + struct f2fs_summary *sum, enum page_type p_type) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct curseg_info *curseg; + unsigned int old_cursegno; + int type; + + type = __get_segment_type(page, p_type); + curseg = CURSEG_I(sbi, type); + + mutex_lock(&curseg->curseg_mutex); + + *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + old_cursegno = curseg->segno; + + /* + * __add_sum_entry should be called under the curseg_mutex, + * because this function updates a summary entry in the + * current summary block. + */ + __add_sum_entry(sbi, type, sum, curseg->next_blkoff); + + mutex_lock(&sit_i->sentry_lock); + __refresh_next_blkoff(sbi, curseg); + sbi->block_count[curseg->alloc_type]++; + + /* + * SIT information should be updated before segment allocation, + * since SSR needs latest valid block information. + */ + refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); + + if (!__has_curseg_space(sbi, type)) + sit_i->s_ops->allocate_segment(sbi, type, false); + + locate_dirty_segment(sbi, old_cursegno); + locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); + mutex_unlock(&sit_i->sentry_lock); + + if (p_type == NODE) + fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); + + /* write out the dirty page to the block device */ + submit_write_page(sbi, page, *new_blkaddr, p_type); + + mutex_unlock(&curseg->curseg_mutex); +} + +int write_meta_page(struct f2fs_sb_info *sbi, struct page *page, + struct writeback_control *wbc) +{ + if (wbc->for_reclaim) + return AOP_WRITEPAGE_ACTIVATE; + + set_page_writeback(page); + submit_write_page(sbi, page, page->index, META); + return 0; +} + +void write_node_page(struct f2fs_sb_info *sbi, struct page *page, + unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr) +{ + struct f2fs_summary sum; + set_summary(&sum, nid, 0, 0); + do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, NODE); +} + +void write_data_page(struct inode *inode, struct page *page, + struct dnode_of_data *dn, block_t old_blkaddr, + block_t *new_blkaddr) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_summary sum; + struct node_info ni; + + BUG_ON(old_blkaddr == NULL_ADDR); + get_node_info(sbi, dn->nid, &ni); + set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); + + do_write_page(sbi, page, old_blkaddr, + new_blkaddr, &sum, DATA); +} + +void rewrite_data_page(struct f2fs_sb_info *sbi, struct page *page, + block_t old_blk_addr) +{ + submit_write_page(sbi, page, old_blk_addr, DATA); +} + +void recover_data_page(struct f2fs_sb_info *sbi, + struct page *page, struct f2fs_summary *sum, + block_t old_blkaddr, block_t new_blkaddr) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct curseg_info *curseg; + unsigned int segno, old_cursegno; + struct seg_entry *se; + int type; + + segno = GET_SEGNO(sbi, new_blkaddr); + se = get_seg_entry(sbi, segno); + type = se->type; + + if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) { + if (old_blkaddr == NULL_ADDR) + type = CURSEG_COLD_DATA; + else + type = CURSEG_WARM_DATA; + 
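/* a free, non-current segment can be retyped: recovered data goes to the cold log when it was newly written, to the warm log when updated */ +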
} + curseg = CURSEG_I(sbi, type); + + mutex_lock(&curseg->curseg_mutex); + mutex_lock(&sit_i->sentry_lock); + + old_cursegno = curseg->segno; + + /* change the current segment */ + if (segno != curseg->segno) { + curseg->next_segno = segno; + change_curseg(sbi, type, true); + } + + curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & + (sbi->blocks_per_seg - 1); + __add_sum_entry(sbi, type, sum, curseg->next_blkoff); + + refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); + + locate_dirty_segment(sbi, old_cursegno); + locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); + + mutex_unlock(&sit_i->sentry_lock); + mutex_unlock(&curseg->curseg_mutex); +} + +void rewrite_node_page(struct f2fs_sb_info *sbi, + struct page *page, struct f2fs_summary *sum, + block_t old_blkaddr, block_t new_blkaddr) +{ + struct sit_info *sit_i = SIT_I(sbi); + int type = CURSEG_WARM_NODE; + struct curseg_info *curseg; + unsigned int segno, old_cursegno; + block_t next_blkaddr = next_blkaddr_of_node(page); + unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr); + + curseg = CURSEG_I(sbi, type); + + mutex_lock(&curseg->curseg_mutex); + mutex_lock(&sit_i->sentry_lock); + + segno = GET_SEGNO(sbi, new_blkaddr); + old_cursegno = curseg->segno; + + /* change the current segment */ + if (segno != curseg->segno) { + curseg->next_segno = segno; + change_curseg(sbi, type, true); + } + curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & + (sbi->blocks_per_seg - 1); + __add_sum_entry(sbi, type, sum, curseg->next_blkoff); + + /* change the current log to the next block addr in advance */ + if (next_segno != segno) { + curseg->next_segno = next_segno; + change_curseg(sbi, type, true); + } + curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, next_blkaddr) & + (sbi->blocks_per_seg - 1); + + /* rewrite node page */ + set_page_writeback(page); + submit_write_page(sbi, page, new_blkaddr, NODE); + f2fs_submit_bio(sbi, NODE, true); + refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); + + locate_dirty_segment(sbi, old_cursegno); + locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); + + mutex_unlock(&sit_i->sentry_lock); + mutex_unlock(&curseg->curseg_mutex); +} + +static int read_compacted_summaries(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct curseg_info *seg_i; + unsigned char *kaddr; + struct page *page; + block_t start; + int i, j, offset; + + start = start_sum_block(sbi); + + page = get_meta_page(sbi, start++); + kaddr = (unsigned char *)page_address(page); + + /* Step 1: restore nat cache */ + seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); + memcpy(&seg_i->sum_blk->n_nats, kaddr, SUM_JOURNAL_SIZE); + + /* Step 2: restore sit cache */ + seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); + memcpy(&seg_i->sum_blk->n_sits, kaddr + SUM_JOURNAL_SIZE, + SUM_JOURNAL_SIZE); + offset = 2 * SUM_JOURNAL_SIZE; + + /* Step 3: restore summary entries */ + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + unsigned short blk_off; + unsigned int segno; + + seg_i = CURSEG_I(sbi, i); + segno = le32_to_cpu(ckpt->cur_data_segno[i]); + blk_off = le16_to_cpu(ckpt->cur_data_blkoff[i]); + seg_i->next_segno = segno; + reset_curseg(sbi, i, 0); + seg_i->alloc_type = ckpt->alloc_type[i]; + seg_i->next_blkoff = blk_off; + + if (seg_i->alloc_type == SSR) + blk_off = sbi->blocks_per_seg; + + for (j = 0; j < blk_off; j++) { + struct f2fs_summary *s; + s = (struct f2fs_summary *)(kaddr + offset); + seg_i->sum_blk->entries[j] = *s; + offset += SUMMARY_SIZE; + if (offset + SUMMARY_SIZE <= PAGE_CACHE_SIZE - + 
SUM_FOOTER_SIZE) + continue; + + f2fs_put_page(page, 1); + page = NULL; + + page = get_meta_page(sbi, start++); + kaddr = (unsigned char *)page_address(page); + offset = 0; + } + } + f2fs_put_page(page, 1); + return 0; +} + +static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_summary_block *sum; + struct curseg_info *curseg; + struct page *new; + unsigned short blk_off; + unsigned int segno = 0; + block_t blk_addr = 0; + + /* get segment number and block addr */ + if (IS_DATASEG(type)) { + segno = le32_to_cpu(ckpt->cur_data_segno[type]); + blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type - + CURSEG_HOT_DATA]); + if (ckpt->ckpt_flags & CP_UMOUNT_FLAG) + blk_addr = sum_blk_addr(sbi, NR_CURSEG_TYPE, type); + else + blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type); + } else { + segno = le32_to_cpu(ckpt->cur_node_segno[type - + CURSEG_HOT_NODE]); + blk_off = le16_to_cpu(ckpt->cur_node_blkoff[type - + CURSEG_HOT_NODE]); + if (ckpt->ckpt_flags & CP_UMOUNT_FLAG) + blk_addr = sum_blk_addr(sbi, NR_CURSEG_NODE_TYPE, + type - CURSEG_HOT_NODE); + else + blk_addr = GET_SUM_BLOCK(sbi, segno); + } + + new = get_meta_page(sbi, blk_addr); + sum = (struct f2fs_summary_block *)page_address(new); + + if (IS_NODESEG(type)) { + if (ckpt->ckpt_flags & CP_UMOUNT_FLAG) { + struct f2fs_summary *ns = &sum->entries[0]; + int i; + for (i = 0; i < sbi->blocks_per_seg; i++, ns++) { + ns->version = 0; + ns->ofs_in_node = 0; + } + } else { + if (restore_node_summary(sbi, segno, sum)) { + f2fs_put_page(new, 1); + return -EINVAL; + } + } + } + + /* set uncompleted segment to curseg */ + curseg = CURSEG_I(sbi, type); + mutex_lock(&curseg->curseg_mutex); + memcpy(curseg->sum_blk, sum, PAGE_CACHE_SIZE); + curseg->next_segno = segno; + reset_curseg(sbi, type, 0); + curseg->alloc_type = ckpt->alloc_type[type]; + curseg->next_blkoff = blk_off; + mutex_unlock(&curseg->curseg_mutex); + f2fs_put_page(new, 1); + return 0; +} + +static int restore_curseg_summaries(struct f2fs_sb_info *sbi) +{ + int type = CURSEG_HOT_DATA; + + if (sbi->ckpt->ckpt_flags & CP_COMPACT_SUM_FLAG) { + /* restore for compacted data summary */ + if (read_compacted_summaries(sbi)) + return -EINVAL; + type = CURSEG_HOT_NODE; + } + + for (; type <= CURSEG_COLD_NODE; type++) + if (read_normal_summaries(sbi, type)) + return -EINVAL; + return 0; +} + +static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + struct page *page; + unsigned char *kaddr; + struct f2fs_summary *summary; + struct curseg_info *seg_i; + int written_size = 0; + int i, j; + + page = grab_meta_page(sbi, blkaddr++); + kaddr = (unsigned char *)page_address(page); + + /* Step 1: write nat cache */ + seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); + memcpy(kaddr, &seg_i->sum_blk->n_nats, SUM_JOURNAL_SIZE); + written_size += SUM_JOURNAL_SIZE; + + /* Step 2: write sit cache */ + seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); + memcpy(kaddr + written_size, &seg_i->sum_blk->n_sits, + SUM_JOURNAL_SIZE); + written_size += SUM_JOURNAL_SIZE; + + set_page_dirty(page); + + /* Step 3: write summary entries */ + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + unsigned short blkoff; + seg_i = CURSEG_I(sbi, i); + if (sbi->ckpt->alloc_type[i] == SSR) + blkoff = sbi->blocks_per_seg; + else + blkoff = curseg_blkoff(sbi, i); + + for (j = 0; j < blkoff; j++) { + if (!page) { + page = grab_meta_page(sbi, blkaddr++); + kaddr = (unsigned char *)page_address(page); + written_size = 0; + } + summary = (struct 
f2fs_summary *)(kaddr + written_size); + *summary = seg_i->sum_blk->entries[j]; + written_size += SUMMARY_SIZE; + set_page_dirty(page); + + if (written_size + SUMMARY_SIZE <= PAGE_CACHE_SIZE - + SUM_FOOTER_SIZE) + continue; + + f2fs_put_page(page, 1); + page = NULL; + } + } + if (page) + f2fs_put_page(page, 1); +} + +static void write_normal_summaries(struct f2fs_sb_info *sbi, + block_t blkaddr, int type) +{ + int i, end; + if (IS_DATASEG(type)) + end = type + NR_CURSEG_DATA_TYPE; + else + end = type + NR_CURSEG_NODE_TYPE; + + for (i = type; i < end; i++) { + struct curseg_info *sum = CURSEG_I(sbi, i); + mutex_lock(&sum->curseg_mutex); + write_sum_page(sbi, sum->sum_blk, blkaddr + (i - type)); + mutex_unlock(&sum->curseg_mutex); + } +} + +void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) +{ + if (sbi->ckpt->ckpt_flags & CP_COMPACT_SUM_FLAG) + write_compacted_summaries(sbi, start_blk); + else + write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA); +} + +void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) +{ + if (sbi->ckpt->ckpt_flags & CP_UMOUNT_FLAG) + write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); + return; +} + +int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type, + unsigned int val, int alloc) +{ + int i; + + if (type == NAT_JOURNAL) { + for (i = 0; i < nats_in_cursum(sum); i++) { + if (le32_to_cpu(nid_in_journal(sum, i)) == val) + return i; + } + if (alloc && nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) + return update_nats_in_cursum(sum, 1); + } else if (type == SIT_JOURNAL) { + for (i = 0; i < sits_in_cursum(sum); i++) + if (le32_to_cpu(segno_in_journal(sum, i)) == val) + return i; + if (alloc && sits_in_cursum(sum) < SIT_JOURNAL_ENTRIES) + return update_sits_in_cursum(sum, 1); + } + return -1; +} + +static struct page *get_current_sit_page(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned int offset = SIT_BLOCK_OFFSET(sit_i, segno); + block_t blk_addr = sit_i->sit_base_addr + offset; + + check_seg_range(sbi, segno); + + /* calculate sit block address */ + if (f2fs_test_bit(offset, sit_i->sit_bitmap)) + blk_addr += sit_i->sit_blocks; + + return get_meta_page(sbi, blk_addr); +} + +static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, + unsigned int start) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct page *src_page, *dst_page; + pgoff_t src_off, dst_off; + void *src_addr, *dst_addr; + + src_off = current_sit_addr(sbi, start); + dst_off = next_sit_addr(sbi, src_off); + + /* get current sit block page without lock */ + src_page = get_meta_page(sbi, src_off); + dst_page = grab_meta_page(sbi, dst_off); + BUG_ON(PageDirty(src_page)); + + src_addr = page_address(src_page); + dst_addr = page_address(dst_page); + memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE); + + set_page_dirty(dst_page); + f2fs_put_page(src_page, 1); + + set_to_next_sit(sit_i, start); + + return dst_page; +} + +static bool flush_sits_in_journal(struct f2fs_sb_info *sbi) +{ + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); + struct f2fs_summary_block *sum = curseg->sum_blk; + int i; + + /* + * If the journal area in the current summary is full of sit entries, + * all the sit entries will be flushed. Otherwise, newly hot sit + * entries cannot replace the existing journal entries. 
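+ * When the journal is flushed here, each journalled segno is simply + * marked dirty, so that its entry is written back through a SIT block + * by flush_sit_entries() below.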
+ */ + if (sits_in_cursum(sum) >= SIT_JOURNAL_ENTRIES) { + for (i = sits_in_cursum(sum) - 1; i >= 0; i--) { + unsigned int segno; + segno = le32_to_cpu(segno_in_journal(sum, i)); + __mark_sit_entry_dirty(sbi, segno); + } + update_sits_in_cursum(sum, -sits_in_cursum(sum)); + return 1; + } + return 0; +} + +/** + * CP calls this function, which flushes SIT entries including sit_journal, + * and moves prefree segs to free segs. + */ +void flush_sit_entries(struct f2fs_sb_info *sbi) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned long *bitmap = sit_i->dirty_sentries_bitmap; + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); + struct f2fs_summary_block *sum = curseg->sum_blk; + unsigned long nsegs = TOTAL_SEGS(sbi); + struct page *page = NULL; + struct f2fs_sit_block *raw_sit = NULL; + unsigned int start = 0, end = 0; + unsigned int segno = -1; + bool flushed; + + mutex_lock(&curseg->curseg_mutex); + mutex_lock(&sit_i->sentry_lock); + + /* + * "flushed" indicates whether sit entries in journal are flushed + * to the SIT area or not. + */ + flushed = flush_sits_in_journal(sbi); + + while ((segno = find_next_bit(bitmap, nsegs, segno + 1)) < nsegs) { + struct seg_entry *se = get_seg_entry(sbi, segno); + int sit_offset, offset; + + sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); + + if (flushed) + goto to_sit_page; + + offset = lookup_journal_in_cursum(sum, SIT_JOURNAL, segno, 1); + if (offset >= 0) { + segno_in_journal(sum, offset) = cpu_to_le32(segno); + seg_info_to_raw_sit(se, &sit_in_journal(sum, offset)); + goto flush_done; + } +to_sit_page: + if (!page || (start > segno) || (segno > end)) { + if (page) { + f2fs_put_page(page, 1); + page = NULL; + } + + start = START_SEGNO(sit_i, segno); + end = start + SIT_ENTRY_PER_BLOCK - 1; + + /* read sit block that will be updated */ + page = get_next_sit_page(sbi, start); + raw_sit = page_address(page); + } + + /* update entry in SIT block */ + seg_info_to_raw_sit(se, &raw_sit->entries[sit_offset]); +flush_done: + __clear_bit(segno, bitmap); + sit_i->dirty_sentries--; + } + mutex_unlock(&sit_i->sentry_lock); + mutex_unlock(&curseg->curseg_mutex); + + /* write out the last modified SIT block */ + f2fs_put_page(page, 1); + + set_prefree_as_free_segments(sbi); +} + +static int build_sit_info(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct sit_info *sit_i; + unsigned int sit_segs, start; + char *src_bitmap, *dst_bitmap; + unsigned int bitmap_size; + + /* allocate memory for SIT information */ + sit_i = kzalloc(sizeof(struct sit_info), GFP_KERNEL); + if (!sit_i) + return -ENOMEM; + + SM_I(sbi)->sit_info = sit_i; + + sit_i->sentries = vzalloc(TOTAL_SEGS(sbi) * sizeof(struct seg_entry)); + if (!sit_i->sentries) + return -ENOMEM; + + bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); + sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL); + if (!sit_i->dirty_sentries_bitmap) + return -ENOMEM; + + for (start = 0; start < TOTAL_SEGS(sbi); start++) { + sit_i->sentries[start].cur_valid_map + = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + sit_i->sentries[start].ckpt_valid_map + = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + if (!sit_i->sentries[start].cur_valid_map + || !sit_i->sentries[start].ckpt_valid_map) + return -ENOMEM; + } + + if (sbi->segs_per_sec > 1) { + sit_i->sec_entries = vzalloc(sbi->total_sections * + sizeof(struct sec_entry)); + if (!sit_i->sec_entries) + return -ENOMEM; + } + + /* get information related with SIT */ + sit_segs = 
le32_to_cpu(raw_super->segment_count_sit) >> 1; + + /* set up the SIT bitmap from the checkpoint pack */ + bitmap_size = __bitmap_size(sbi, SIT_BITMAP); + src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP); + + dst_bitmap = kzalloc(bitmap_size, GFP_KERNEL); + if (!dst_bitmap) + return -ENOMEM; + memcpy(dst_bitmap, src_bitmap, bitmap_size); + + /* init SIT information */ + sit_i->s_ops = &default_salloc_ops; + + sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); + sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; + sit_i->written_valid_blocks = le64_to_cpu(ckpt->valid_block_count); + sit_i->sit_bitmap = dst_bitmap; + sit_i->bitmap_size = bitmap_size; + sit_i->dirty_sentries = 0; + sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK; + sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time); + sit_i->mounted_time = CURRENT_TIME_SEC.tv_sec; + mutex_init(&sit_i->sentry_lock); + return 0; +} + +static int build_free_segmap(struct f2fs_sb_info *sbi) +{ + struct f2fs_sm_info *sm_info = SM_I(sbi); + struct free_segmap_info *free_i; + unsigned int bitmap_size, sec_bitmap_size; + + /* allocate memory for free segmap information */ + free_i = kzalloc(sizeof(struct free_segmap_info), GFP_KERNEL); + if (!free_i) + return -ENOMEM; + + SM_I(sbi)->free_info = free_i; + + bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); + free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL); + if (!free_i->free_segmap) + return -ENOMEM; + + sec_bitmap_size = f2fs_bitmap_size(sbi->total_sections); + free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL); + if (!free_i->free_secmap) + return -ENOMEM; + + /* set all segments as dirty temporarily */ + memset(free_i->free_segmap, 0xff, bitmap_size); + memset(free_i->free_secmap, 0xff, sec_bitmap_size); + + /* init free segmap information */ + free_i->start_segno = + (unsigned int) GET_SEGNO_FROM_SEG0(sbi, sm_info->main_blkaddr); + free_i->free_segments = 0; + free_i->free_sections = 0; + rwlock_init(&free_i->segmap_lock); + return 0; +} + +static int build_curseg(struct f2fs_sb_info *sbi) +{ + struct curseg_info *array = NULL; + int i; + + array = kzalloc(sizeof(*array) * NR_CURSEG_TYPE, GFP_KERNEL); + if (!array) + return -ENOMEM; + + SM_I(sbi)->curseg_array = array; + + for (i = 0; i < NR_CURSEG_TYPE; i++) { + mutex_init(&array[i].curseg_mutex); + array[i].sum_blk = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL); + if (!array[i].sum_blk) + return -ENOMEM; + array[i].segno = NULL_SEGNO; + array[i].next_blkoff = 0; + } + return restore_curseg_summaries(sbi); +} + +static void build_sit_entries(struct f2fs_sb_info *sbi) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); + struct f2fs_summary_block *sum = curseg->sum_blk; + unsigned int start; + + for (start = 0; start < TOTAL_SEGS(sbi); start++) { + struct seg_entry *se = &sit_i->sentries[start]; + struct f2fs_sit_block *sit_blk; + struct f2fs_sit_entry sit; + struct page *page; + int i; + + mutex_lock(&curseg->curseg_mutex); + for (i = 0; i < sits_in_cursum(sum); i++) { + if (le32_to_cpu(segno_in_journal(sum, i)) == start) { + sit = sit_in_journal(sum, i); + mutex_unlock(&curseg->curseg_mutex); + goto got_it; + } + } + mutex_unlock(&curseg->curseg_mutex); + page = get_current_sit_page(sbi, start); + sit_blk = (struct f2fs_sit_block *)page_address(page); + sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; + f2fs_put_page(page, 1); +got_it: + check_block_count(sbi, start, &sit); + seg_info_from_raw_sit(se, &sit); + if (sbi->segs_per_sec > 1) { + struct sec_entry *e = 
get_sec_entry(sbi, start); + e->valid_blocks += se->valid_blocks; + } + } +} + +static void init_free_segmap(struct f2fs_sb_info *sbi) +{ + unsigned int start; + int type; + + for (start = 0; start < TOTAL_SEGS(sbi); start++) { + struct seg_entry *sentry = get_seg_entry(sbi, start); + if (!sentry->valid_blocks) + __set_free(sbi, start); + } + + /* mark the current segments as in use */ + for (type = CURSEG_HOT_DATA; type <= CURSEG_COLD_NODE; type++) { + struct curseg_info *curseg_t = CURSEG_I(sbi, type); + __set_test_and_inuse(sbi, curseg_t->segno); + } +} + +static void init_dirty_segmap(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + struct free_segmap_info *free_i = FREE_I(sbi); + unsigned int segno = 0, offset = 0; + unsigned short valid_blocks; + + while (segno < TOTAL_SEGS(sbi)) { + /* find dirty segment based on free segmap */ + segno = find_next_inuse(free_i, TOTAL_SEGS(sbi), offset); + if (segno >= TOTAL_SEGS(sbi)) + break; + offset = segno + 1; + valid_blocks = get_valid_blocks(sbi, segno, 0); + if (valid_blocks >= sbi->blocks_per_seg || !valid_blocks) + continue; + mutex_lock(&dirty_i->seglist_lock); + __locate_dirty_segment(sbi, segno, DIRTY); + mutex_unlock(&dirty_i->seglist_lock); + } +} + +static int init_victim_segmap(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); + + dirty_i->victim_segmap[FG_GC] = kzalloc(bitmap_size, GFP_KERNEL); + dirty_i->victim_segmap[BG_GC] = kzalloc(bitmap_size, GFP_KERNEL); + if (!dirty_i->victim_segmap[FG_GC] || !dirty_i->victim_segmap[BG_GC]) + return -ENOMEM; + return 0; +} + +static int build_dirty_segmap(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i; + unsigned int bitmap_size, i; + + /* allocate memory for dirty segments list information */ + dirty_i = kzalloc(sizeof(struct dirty_seglist_info), GFP_KERNEL); + if (!dirty_i) + return -ENOMEM; + + SM_I(sbi)->dirty_info = dirty_i; + mutex_init(&dirty_i->seglist_lock); + + bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); + + for (i = 0; i < NR_DIRTY_TYPE; i++) { + dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL); + dirty_i->nr_dirty[i] = 0; + if (!dirty_i->dirty_segmap[i]) + return -ENOMEM; + } + + init_dirty_segmap(sbi); + return init_victim_segmap(sbi); +} + +/** + * Update min, max modified time for cost-benefit GC algorithm + */ +static void init_min_max_mtime(struct f2fs_sb_info *sbi) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned int segno; + + mutex_lock(&sit_i->sentry_lock); + + sit_i->min_mtime = LLONG_MAX; + + for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) { + unsigned int i; + unsigned long long mtime = 0; + + for (i = 0; i < sbi->segs_per_sec; i++) + mtime += get_seg_entry(sbi, segno + i)->mtime; + + mtime = div_u64(mtime, sbi->segs_per_sec); + + if (sit_i->min_mtime > mtime) + sit_i->min_mtime = mtime; + } + sit_i->max_mtime = get_mtime(sbi); + mutex_unlock(&sit_i->sentry_lock); +} + +int build_segment_manager(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_sm_info *sm_info = NULL; + int err; + + sm_info = kzalloc(sizeof(struct f2fs_sm_info), GFP_KERNEL); + if (!sm_info) + return -ENOMEM; + + /* init sm info */ + sbi->sm_info = sm_info; + INIT_LIST_HEAD(&sm_info->wblist_head); + spin_lock_init(&sm_info->wblist_lock); + sm_info->seg0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); + 
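/* the rest of the geometry comes from the raw superblock and the checkpoint */ +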
sm_info->main_blkaddr = le32_to_cpu(raw_super->main_blkaddr); + sm_info->segment_count = le32_to_cpu(raw_super->segment_count); + sm_info->reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count); + sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); + sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main); + sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); + + err = build_sit_info(sbi); + if (err) + return err; + err = build_free_segmap(sbi); + if (err) + return err; + err = build_curseg(sbi); + if (err) + return err; + + /* reinit free segmap based on SIT */ + build_sit_entries(sbi); + + init_free_segmap(sbi); + err = build_dirty_segmap(sbi); + if (err) + return err; + + init_min_max_mtime(sbi); + return 0; +} + +static void discard_dirty_segmap(struct f2fs_sb_info *sbi, + enum dirty_type dirty_type) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + + mutex_lock(&dirty_i->seglist_lock); + kfree(dirty_i->dirty_segmap[dirty_type]); + dirty_i->nr_dirty[dirty_type] = 0; + mutex_unlock(&dirty_i->seglist_lock); +} + +void reset_victim_segmap(struct f2fs_sb_info *sbi) +{ + unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); + memset(DIRTY_I(sbi)->victim_segmap[FG_GC], 0, bitmap_size); +} + +static void destroy_victim_segmap(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + + kfree(dirty_i->victim_segmap[FG_GC]); + kfree(dirty_i->victim_segmap[BG_GC]); +} + +static void destroy_dirty_segmap(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + int i; + + if (!dirty_i) + return; + + /* discard pre-free/dirty segments list */ + for (i = 0; i < NR_DIRTY_TYPE; i++) + discard_dirty_segmap(sbi, i); + + destroy_victim_segmap(sbi); + SM_I(sbi)->dirty_info = NULL; + kfree(dirty_i); +} + +static void destroy_curseg(struct f2fs_sb_info *sbi) +{ + struct curseg_info *array = SM_I(sbi)->curseg_array; + int i; + + if (!array) + return; + SM_I(sbi)->curseg_array = NULL; + for (i = 0; i < NR_CURSEG_TYPE; i++) + kfree(array[i].sum_blk); + kfree(array); +} + +static void destroy_free_segmap(struct f2fs_sb_info *sbi) +{ + struct free_segmap_info *free_i = SM_I(sbi)->free_info; + if (!free_i) + return; + SM_I(sbi)->free_info = NULL; + kfree(free_i->free_segmap); + kfree(free_i->free_secmap); + kfree(free_i); +} + +static void destroy_sit_info(struct f2fs_sb_info *sbi) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned int start; + + if (!sit_i) + return; + + if (sit_i->sentries) { + for (start = 0; start < TOTAL_SEGS(sbi); start++) { + kfree(sit_i->sentries[start].cur_valid_map); + kfree(sit_i->sentries[start].ckpt_valid_map); + } + } + vfree(sit_i->sentries); + vfree(sit_i->sec_entries); + kfree(sit_i->dirty_sentries_bitmap); + + SM_I(sbi)->sit_info = NULL; + kfree(sit_i->sit_bitmap); + kfree(sit_i); +} + +void destroy_segment_manager(struct f2fs_sb_info *sbi) +{ + struct f2fs_sm_info *sm_info = SM_I(sbi); + destroy_dirty_segmap(sbi); + destroy_curseg(sbi); + destroy_free_segmap(sbi); + destroy_sit_info(sbi); + sbi->sm_info = NULL; + kfree(sm_info); +} -- cgit v1.2.1 From fbfa2cc58d5363f780f4f2ad0243a47185c2bb2a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 2 Nov 2012 17:09:44 +0900 Subject: f2fs: add file operations This adds memory operations and file/file_inode operations. - F2FS supports fallocate(), mmap(), fsync(), and basic ioctl(). 
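As a rough usage sketch (not part of the patch; the mount point and file name are hypothetical), the following user-space C program exercises two of the interfaces this commit wires up: punching a hole via fallocate(), which is served by punch_hole() below, and reading the inode flags through the FS_IOC_GETFLAGS case of f2fs_ioctl():

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>      /* FS_IOC_GETFLAGS */
#include <linux/falloc.h>  /* FALLOC_FL_PUNCH_HOLE, FALLOC_FL_KEEP_SIZE */

int main(void)
{
	/* hypothetical f2fs mount point and test file */
	int fd = open("/mnt/f2fs/testfile", O_RDWR);
	int flags;

	if (fd < 0)
		return 1;

	/* served by punch_hole(): drop 8KB at offset 4KB, keeping i_size */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      4096, 8192) < 0)
		perror("fallocate");

	/* served by f2fs_ioctl(); the kernel copies the flags out as an int */
	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) == 0)
		printf("inode flags: %#x\n", flags);

	close(fd);
	return 0;
}

Note that the patch accepts only FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE; any other mode bits make f2fs_fallocate() return -EOPNOTSUPP.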
Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 637 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 637 insertions(+) create mode 100644 fs/f2fs/file.c (limited to 'fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c new file mode 100644 index 000000000000..f5ae36d19f4f --- /dev/null +++ b/fs/f2fs/file.c @@ -0,0 +1,637 @@ +/** + * fs/f2fs/file.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/stat.h> +#include <linux/buffer_head.h> +#include <linux/writeback.h> +#include <linux/falloc.h> +#include <linux/types.h> +#include <linux/uaccess.h> +#include <linux/mount.h> + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include "xattr.h" +#include "acl.h" + +static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct page *node_page; + block_t old_blk_addr; + struct dnode_of_data dn; + int err; + + f2fs_balance_fs(sbi); + + sb_start_pagefault(inode->i_sb); + + mutex_lock_op(sbi, DATA_NEW); + + /* block allocation */ + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, page->index, 0); + if (err) { + mutex_unlock_op(sbi, DATA_NEW); + goto out; + } + + old_blk_addr = dn.data_blkaddr; + node_page = dn.node_page; + + if (old_blk_addr == NULL_ADDR) { + err = reserve_new_block(&dn); + if (err) { + f2fs_put_dnode(&dn); + mutex_unlock_op(sbi, DATA_NEW); + goto out; + } + } + f2fs_put_dnode(&dn); + + mutex_unlock_op(sbi, DATA_NEW); + + lock_page(page); + if (page->mapping != inode->i_mapping || + page_offset(page) >= i_size_read(inode) || + !PageUptodate(page)) { + unlock_page(page); + err = -EFAULT; + goto out; + } + + /* + * check to see if the page is mapped already (no holes) + */ + if (PageMappedToDisk(page)) + goto out; + + /* fill the page */ + wait_on_page_writeback(page); + + /* page is wholly or partially inside EOF */ + if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) { + unsigned offset; + offset = i_size_read(inode) & ~PAGE_CACHE_MASK; + zero_user_segment(page, offset, PAGE_CACHE_SIZE); + } + set_page_dirty(page); + SetPageUptodate(page); + + file_update_time(vma->vm_file); +out: + sb_end_pagefault(inode->i_sb); + return block_page_mkwrite_return(err); +} + +static const struct vm_operations_struct f2fs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = f2fs_vm_page_mkwrite, +}; + +static int need_to_sync_dir(struct f2fs_sb_info *sbi, struct inode *inode) +{ + struct dentry *dentry; + nid_t pino; + + inode = igrab(inode); + dentry = d_find_any_alias(inode); + if (!dentry) { + iput(inode); + return 0; + } + pino = dentry->d_parent->d_inode->i_ino; + dput(dentry); + iput(inode); + return !is_checkpointed_node(sbi, pino); +} + +int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + unsigned long long cur_version; + int ret = 0; + bool need_cp = false; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, + .for_reclaim = 0, + }; + + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + + mutex_lock(&inode->i_mutex); + + if (inode->i_sb->s_flags & MS_RDONLY) + goto out; + if (datasync && 
!(inode->i_state & I_DIRTY_DATASYNC)) + goto out; + + mutex_lock(&sbi->cp_mutex); + cur_version = le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver); + mutex_unlock(&sbi->cp_mutex); + + if (F2FS_I(inode)->data_version != cur_version && + !(inode->i_state & I_DIRTY)) + goto out; + F2FS_I(inode)->data_version--; + + if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) + need_cp = true; + if (is_inode_flag_set(F2FS_I(inode), FI_NEED_CP)) + need_cp = true; + if (!space_for_roll_forward(sbi)) + need_cp = true; + if (need_to_sync_dir(sbi, inode)) + need_cp = true; + + f2fs_write_inode(inode, NULL); + + if (need_cp) { + /* all the dirty node pages should be flushed for POR */ + ret = f2fs_sync_fs(inode->i_sb, 1); + clear_inode_flag(F2FS_I(inode), FI_NEED_CP); + } else { + while (sync_node_pages(sbi, inode->i_ino, &wbc) == 0) + f2fs_write_inode(inode, NULL); + filemap_fdatawait_range(sbi->node_inode->i_mapping, + 0, LONG_MAX); + } +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + +static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + file_accessed(file); + vma->vm_ops = &f2fs_file_vm_ops; + return 0; +} + +static int truncate_data_blocks_range(struct dnode_of_data *dn, int count) +{ + int nr_free = 0, ofs = dn->ofs_in_node; + struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); + struct f2fs_node *raw_node; + __le32 *addr; + + raw_node = page_address(dn->node_page); + addr = blkaddr_in_node(raw_node) + ofs; + + for ( ; count > 0; count--, addr++, dn->ofs_in_node++) { + block_t blkaddr = le32_to_cpu(*addr); + if (blkaddr == NULL_ADDR) + continue; + + update_extent_cache(NULL_ADDR, dn); + invalidate_blocks(sbi, blkaddr); + dec_valid_block_count(sbi, dn->inode, 1); + nr_free++; + } + if (nr_free) { + set_page_dirty(dn->node_page); + sync_inode_page(dn); + } + dn->ofs_in_node = ofs; + return nr_free; +} + +void truncate_data_blocks(struct dnode_of_data *dn) +{ + truncate_data_blocks_range(dn, ADDRS_PER_BLOCK); +} + +static void truncate_partial_data_page(struct inode *inode, u64 from) +{ + unsigned offset = from & (PAGE_CACHE_SIZE - 1); + struct page *page; + + if (!offset) + return; + + page = find_data_page(inode, from >> PAGE_CACHE_SHIFT); + if (IS_ERR(page)) + return; + + lock_page(page); + wait_on_page_writeback(page); + zero_user(page, offset, PAGE_CACHE_SIZE - offset); + set_page_dirty(page); + f2fs_put_page(page, 1); +} + +static int truncate_blocks(struct inode *inode, u64 from) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + unsigned int blocksize = inode->i_sb->s_blocksize; + struct dnode_of_data dn; + pgoff_t free_from; + int count = 0; + int err; + + free_from = (pgoff_t) + ((from + blocksize - 1) >> (sbi->log_blocksize)); + + mutex_lock_op(sbi, DATA_TRUNC); + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, free_from, RDONLY_NODE); + if (err) { + if (err == -ENOENT) + goto free_next; + mutex_unlock_op(sbi, DATA_TRUNC); + return err; + } + + if (IS_INODE(dn.node_page)) + count = ADDRS_PER_INODE; + else + count = ADDRS_PER_BLOCK; + + count -= dn.ofs_in_node; + BUG_ON(count < 0); + if (dn.ofs_in_node || IS_INODE(dn.node_page)) { + truncate_data_blocks_range(&dn, count); + free_from += count; + } + + f2fs_put_dnode(&dn); +free_next: + err = truncate_inode_blocks(inode, free_from); + mutex_unlock_op(sbi, DATA_TRUNC); + + /* lastly zero out the first data page */ + truncate_partial_data_page(inode, from); + + return err; +} + +void f2fs_truncate(struct inode *inode) +{ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + 
S_ISLNK(inode->i_mode))) + return; + + if (!truncate_blocks(inode, i_size_read(inode))) { + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); + } + + f2fs_balance_fs(F2FS_SB(inode->i_sb)); +} + +static int f2fs_getattr(struct vfsmount *mnt, + struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + generic_fillattr(inode, stat); + stat->blocks <<= 3; + return 0; +} + +#ifdef CONFIG_F2FS_FS_POSIX_ACL +static void __setattr_copy(struct inode *inode, const struct iattr *attr) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + unsigned int ia_valid = attr->ia_valid; + + if (ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + if (ia_valid & ATTR_ATIME) + inode->i_atime = timespec_trunc(attr->ia_atime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MTIME) + inode->i_mtime = timespec_trunc(attr->ia_mtime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_CTIME) + inode->i_ctime = timespec_trunc(attr->ia_ctime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MODE) { + umode_t mode = attr->ia_mode; + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + set_acl_inode(fi, mode); + } +} +#else +#define __setattr_copy setattr_copy +#endif + +int f2fs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + struct f2fs_inode_info *fi = F2FS_I(inode); + int err; + + err = inode_change_ok(inode, attr); + if (err) + return err; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + truncate_setsize(inode, attr->ia_size); + f2fs_truncate(inode); + } + + __setattr_copy(inode, attr); + + if (attr->ia_valid & ATTR_MODE) { + err = f2fs_acl_chmod(inode); + if (err || is_inode_flag_set(fi, FI_ACL_MODE)) { + inode->i_mode = fi->i_acl_mode; + clear_inode_flag(fi, FI_ACL_MODE); + } + } + + mark_inode_dirty(inode); + return err; +} + +const struct inode_operations f2fs_file_inode_operations = { + .getattr = f2fs_getattr, + .setattr = f2fs_setattr, + .get_acl = f2fs_get_acl, +#ifdef CONFIG_F2FS_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = f2fs_listxattr, + .removexattr = generic_removexattr, +#endif +}; + +static void fill_zero(struct inode *inode, pgoff_t index, + loff_t start, loff_t len) +{ + struct page *page; + + if (!len) + return; + + page = get_new_data_page(inode, index, false); + + if (!IS_ERR(page)) { + wait_on_page_writeback(page); + zero_user(page, start, len); + set_page_dirty(page); + f2fs_put_page(page, 1); + } +} + +int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) +{ + pgoff_t index; + int err; + + for (index = pg_start; index < pg_end; index++) { + struct dnode_of_data dn; + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + + mutex_lock_op(sbi, DATA_TRUNC); + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, index, RDONLY_NODE); + if (err) { + mutex_unlock_op(sbi, DATA_TRUNC); + if (err == -ENOENT) + continue; + return err; + } + + if (dn.data_blkaddr != NULL_ADDR) + truncate_data_blocks_range(&dn, 1); + f2fs_put_dnode(&dn); + mutex_unlock_op(sbi, DATA_TRUNC); + } + return 0; +} + +static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode) +{ + pgoff_t pg_start, pg_end; + loff_t off_start, off_end; + int ret = 0; + + pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; + pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; + + off_start = offset & 
(PAGE_CACHE_SIZE - 1); + off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); + + if (pg_start == pg_end) { + fill_zero(inode, pg_start, off_start, + off_end - off_start); + } else { + if (off_start) + fill_zero(inode, pg_start++, off_start, + PAGE_CACHE_SIZE - off_start); + if (off_end) + fill_zero(inode, pg_end, 0, off_end); + + if (pg_start < pg_end) { + struct address_space *mapping = inode->i_mapping; + loff_t blk_start, blk_end; + + blk_start = pg_start << PAGE_CACHE_SHIFT; + blk_end = pg_end << PAGE_CACHE_SHIFT; + truncate_inode_pages_range(mapping, blk_start, + blk_end - 1); + ret = truncate_hole(inode, pg_start, pg_end); + } + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + i_size_read(inode) <= (offset + len)) { + i_size_write(inode, offset); + mark_inode_dirty(inode); + } + + return ret; +} + +static int expand_inode_data(struct inode *inode, loff_t offset, + loff_t len, int mode) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + pgoff_t index, pg_start, pg_end; + loff_t new_size = i_size_read(inode); + loff_t off_start, off_end; + int ret = 0; + + ret = inode_newsize_ok(inode, (len + offset)); + if (ret) + return ret; + + pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; + pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; + + off_start = offset & (PAGE_CACHE_SIZE - 1); + off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); + + for (index = pg_start; index <= pg_end; index++) { + struct dnode_of_data dn; + + mutex_lock_op(sbi, DATA_NEW); + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, index, 0); + if (ret) { + mutex_unlock_op(sbi, DATA_NEW); + break; + } + + if (dn.data_blkaddr == NULL_ADDR) { + ret = reserve_new_block(&dn); + if (ret) { + f2fs_put_dnode(&dn); + mutex_unlock_op(sbi, DATA_NEW); + break; + } + } + f2fs_put_dnode(&dn); + + mutex_unlock_op(sbi, DATA_NEW); + + if (pg_start == pg_end) + new_size = offset + len; + else if (index == pg_start && off_start) + new_size = (index + 1) << PAGE_CACHE_SHIFT; + else if (index == pg_end) + new_size = (index << PAGE_CACHE_SHIFT) + off_end; + else + new_size += PAGE_CACHE_SIZE; + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + i_size_read(inode) < new_size) { + i_size_write(inode, new_size); + mark_inode_dirty(inode); + } + + return ret; +} + +static long f2fs_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + long ret; + + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + if (mode & FALLOC_FL_PUNCH_HOLE) + ret = punch_hole(inode, offset, len, mode); + else + ret = expand_inode_data(inode, offset, len, mode); + + f2fs_balance_fs(sbi); + return ret; +} + +#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) +#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) + +static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & F2FS_REG_FLMASK; + else + return flags & F2FS_OTHER_FLMASK; +} + +long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct f2fs_inode_info *fi = F2FS_I(inode); + unsigned int flags; + int ret; + + switch (cmd) { + case FS_IOC_GETFLAGS: + flags = fi->i_flags & FS_FL_USER_VISIBLE; + return put_user(flags, (int __user *) arg); + case FS_IOC_SETFLAGS: + { + unsigned int oldflags; + + ret = mnt_want_write(filp->f_path.mnt); + if (ret) + return 
ret; + + if (!inode_owner_or_capable(inode)) { + ret = -EACCES; + goto out; + } + + if (get_user(flags, (int __user *) arg)) { + ret = -EFAULT; + goto out; + } + + flags = f2fs_mask_flags(inode->i_mode, flags); + + mutex_lock(&inode->i_mutex); + + oldflags = fi->i_flags; + + if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + mutex_unlock(&inode->i_mutex); + ret = -EPERM; + goto out; + } + } + + flags = flags & FS_FL_USER_MODIFIABLE; + flags |= oldflags & ~FS_FL_USER_MODIFIABLE; + fi->i_flags = flags; + mutex_unlock(&inode->i_mutex); + + f2fs_set_inode_flags(inode); + inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); +out: + mnt_drop_write(filp->f_path.mnt); + return ret; + } + default: + return -ENOTTY; + } +} + +const struct file_operations f2fs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .open = generic_file_open, + .mmap = f2fs_file_mmap, + .fsync = f2fs_sync_file, + .fallocate = f2fs_fallocate, + .unlocked_ioctl = f2fs_ioctl, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, +}; -- cgit v1.2.1 From eb47b8009dc969a3386c983bd5e798e9f690c5d9 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 2 Nov 2012 17:10:12 +0900 Subject: f2fs: add address space operations for data This adds address space operations for data. - F2FS supports readpages(), writepages(), and direct_IO(). - Because of out-of-place writes, f2fs_direct_IO() does not write data in place. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 701 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 701 insertions(+) create mode 100644 fs/f2fs/data.c (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c new file mode 100644 index 000000000000..c2fd0a80db16 --- /dev/null +++ b/fs/f2fs/data.c @@ -0,0 +1,701 @@ +/** + * fs/f2fs/data.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/buffer_head.h> +#include <linux/mpage.h> +#include <linux/writeback.h> +#include <linux/backing-dev.h> +#include <linux/blkdev.h> +#include <linux/bio.h> + +#include "f2fs.h" +#include "node.h" +#include "segment.h" + +/** + * Lock ordering for the change of data block address: + * ->data_page + * ->node_page + * update block addresses in the node page + */ +static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr) +{ + struct f2fs_node *rn; + __le32 *addr_array; + struct page *node_page = dn->node_page; + unsigned int ofs_in_node = dn->ofs_in_node; + + wait_on_page_writeback(node_page); + + rn = (struct f2fs_node *)page_address(node_page); + + /* Get physical address of data block */ + addr_array = blkaddr_in_node(rn); + addr_array[ofs_in_node] = cpu_to_le32(new_addr); + set_page_dirty(node_page); +} + +int reserve_new_block(struct dnode_of_data *dn) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); + + if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)) + return -EPERM; + if (!inc_valid_block_count(sbi, dn->inode, 1)) + return -ENOSPC; + + __set_data_blkaddr(dn, NEW_ADDR); + dn->data_blkaddr = NEW_ADDR; + sync_inode_page(dn); + return 0; +} + +static int check_extent_cache(struct inode *inode, pgoff_t pgofs, + struct buffer_head *bh_result) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + pgoff_t start_fofs, end_fofs; + block_t start_blkaddr; + + read_lock(&fi->ext.ext_lock); + if (fi->ext.len == 0) { + read_unlock(&fi->ext.ext_lock); + return 0; + } + + sbi->total_hit_ext++; + start_fofs = fi->ext.fofs; + end_fofs = fi->ext.fofs + fi->ext.len - 1; + start_blkaddr = fi->ext.blk_addr; + + if (pgofs >= start_fofs && pgofs <= end_fofs) { + unsigned int blkbits = inode->i_sb->s_blocksize_bits; + size_t count; + + clear_buffer_new(bh_result); + map_bh(bh_result, inode->i_sb, + start_blkaddr + pgofs - start_fofs); + count = end_fofs - pgofs + 1; + if (count < (UINT_MAX >> blkbits)) + bh_result->b_size = (count << blkbits); + else + bh_result->b_size = UINT_MAX; + + sbi->read_hit_ext++; + read_unlock(&fi->ext.ext_lock); + return 1; + } + read_unlock(&fi->ext.ext_lock); + return 0; +} + +void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) +{ + struct f2fs_inode_info *fi = F2FS_I(dn->inode); + pgoff_t fofs, start_fofs, end_fofs; + block_t start_blkaddr, end_blkaddr; + + BUG_ON(blk_addr == NEW_ADDR); + fofs = start_bidx_of_node(ofs_of_node(dn->node_page)) + dn->ofs_in_node; + + /* Update the page address in the parent node */ + __set_data_blkaddr(dn, blk_addr); + + write_lock(&fi->ext.ext_lock); + + start_fofs = fi->ext.fofs; + end_fofs = fi->ext.fofs + fi->ext.len - 1; + start_blkaddr = fi->ext.blk_addr; + end_blkaddr = fi->ext.blk_addr + fi->ext.len - 1; + + /* Drop and initialize the matched extent */ + if (fi->ext.len == 1 && fofs == start_fofs) + fi->ext.len = 0; + + /* Initial extent */ + if (fi->ext.len == 0) { + if (blk_addr != NULL_ADDR) { + fi->ext.fofs = fofs; + fi->ext.blk_addr = blk_addr; + fi->ext.len = 1; + } + goto end_update; + } + + /* Front merge */ + if (fofs == start_fofs - 1 && blk_addr == start_blkaddr - 1) { + fi->ext.fofs--; + fi->ext.blk_addr--; + fi->ext.len++; + goto end_update; + } + + /* Back merge */ + if (fofs == end_fofs + 1 && blk_addr == end_blkaddr + 1) { + fi->ext.len++; + goto end_update; + } + + /* Split the existing extent */ + if (fi->ext.len > 1 && + fofs >= start_fofs && fofs <= end_fofs) { + if ((end_fofs - fofs) < (fi->ext.len >> 1)) { + fi->ext.len = fofs - start_fofs; + } else { + fi->ext.fofs = fofs + 1; + 
fi->ext.blk_addr = start_blkaddr + + fofs - start_fofs + 1; + fi->ext.len -= fofs - start_fofs + 1; + } + goto end_update; + } + write_unlock(&fi->ext.ext_lock); + return; + +end_update: + write_unlock(&fi->ext.ext_lock); + sync_inode_page(dn); + return; +} + +struct page *find_data_page(struct inode *inode, pgoff_t index) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct address_space *mapping = inode->i_mapping; + struct dnode_of_data dn; + struct page *page; + int err; + + page = find_get_page(mapping, index); + if (page && PageUptodate(page)) + return page; + f2fs_put_page(page, 0); + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, index, RDONLY_NODE); + if (err) + return ERR_PTR(err); + f2fs_put_dnode(&dn); + + if (dn.data_blkaddr == NULL_ADDR) + return ERR_PTR(-ENOENT); + + /* By fallocate(), there is no cached page, but with NEW_ADDR */ + if (dn.data_blkaddr == NEW_ADDR) + return ERR_PTR(-EINVAL); + + page = grab_cache_page(mapping, index); + if (!page) + return ERR_PTR(-ENOMEM); + + err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); + if (err) { + f2fs_put_page(page, 1); + return ERR_PTR(err); + } + unlock_page(page); + return page; +} + +/** + * If it tries to access a hole, return an error. + * The callers, such as the functions in dir.c and GC, need to know + * whether this page exists or not. + */ +struct page *get_lock_data_page(struct inode *inode, pgoff_t index) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct address_space *mapping = inode->i_mapping; + struct dnode_of_data dn; + struct page *page; + int err; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, index, RDONLY_NODE); + if (err) + return ERR_PTR(err); + f2fs_put_dnode(&dn); + + if (dn.data_blkaddr == NULL_ADDR) + return ERR_PTR(-ENOENT); + + page = grab_cache_page(mapping, index); + if (!page) + return ERR_PTR(-ENOMEM); + + if (PageUptodate(page)) + return page; + + BUG_ON(dn.data_blkaddr == NEW_ADDR); + BUG_ON(dn.data_blkaddr == NULL_ADDR); + + err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); + if (err) { + f2fs_put_page(page, 1); + return ERR_PTR(err); + } + return page; +} + +/** + * Caller ensures that this data page is never allocated. + * A new zero-filled data page is allocated in the page cache. 
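+ * The page is returned locked and uptodate; callers release it with + * f2fs_put_page().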
+ */ +struct page *get_new_data_page(struct inode *inode, pgoff_t index, + bool new_i_size) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct address_space *mapping = inode->i_mapping; + struct page *page; + struct dnode_of_data dn; + int err; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, index, 0); + if (err) + return ERR_PTR(err); + + if (dn.data_blkaddr == NULL_ADDR) { + if (reserve_new_block(&dn)) { + f2fs_put_dnode(&dn); + return ERR_PTR(-ENOSPC); + } + } + f2fs_put_dnode(&dn); + + page = grab_cache_page(mapping, index); + if (!page) + return ERR_PTR(-ENOMEM); + + if (PageUptodate(page)) + return page; + + if (dn.data_blkaddr == NEW_ADDR) { + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + } else { + err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); + if (err) { + f2fs_put_page(page, 1); + return ERR_PTR(err); + } + } + SetPageUptodate(page); + + if (new_i_size && + i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) { + i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT)); + mark_inode_dirty_sync(inode); + } + return page; +} + +static void read_end_io(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + + do { + struct page *page = bvec->bv_page; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + + if (uptodate) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } while (bvec >= bio->bi_io_vec); + kfree(bio->bi_private); + bio_put(bio); +} + +/** + * Fill the locked page with data located in the block address. + * Read operation is synchronous, and caller must unlock the page. + */ +int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page, + block_t blk_addr, int type) +{ + struct block_device *bdev = sbi->sb->s_bdev; + bool sync = (type == READ_SYNC); + struct bio *bio; + + /* This page can be already read by other threads */ + if (PageUptodate(page)) { + if (!sync) + unlock_page(page); + return 0; + } + + down_read(&sbi->bio_sem); + + /* Allocate a new bio */ + bio = f2fs_bio_alloc(bdev, blk_addr << (sbi->log_blocksize - 9), + 1, GFP_NOFS | __GFP_HIGH); + + /* Initialize the bio */ + bio->bi_end_io = read_end_io; + if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { + kfree(bio->bi_private); + bio_put(bio); + up_read(&sbi->bio_sem); + return -EFAULT; + } + + submit_bio(type, bio); + up_read(&sbi->bio_sem); + + /* wait for read completion if sync */ + if (sync) { + lock_page(page); + if (PageError(page)) + return -EIO; + } + return 0; +} + +/** + * This function should be used by the data read flow only where it + * does not check the "create" flag that indicates block allocation. + * The reason for this special functionality is to exploit VFS readahead + * mechanism. + */ +static int get_data_block_ro(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + unsigned int blkbits = inode->i_sb->s_blocksize_bits; + unsigned maxblocks = bh_result->b_size >> blkbits; + struct dnode_of_data dn; + pgoff_t pgofs; + int err; + + /* Get the page offset from the block offset(iblock) */ + pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits)); + + if (check_extent_cache(inode, pgofs, bh_result)) + return 0; + + /* When reading holes, we need its node page */ + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, pgofs, RDONLY_NODE); + if (err) + return (err == -ENOENT) ? 
0 : err; + + /* It does not support data allocation */ + BUG_ON(create); + + if (dn.data_blkaddr != NEW_ADDR && dn.data_blkaddr != NULL_ADDR) { + int i; + unsigned int end_offset; + + end_offset = IS_INODE(dn.node_page) ? + ADDRS_PER_INODE : + ADDRS_PER_BLOCK; + + clear_buffer_new(bh_result); + + /* Give more consecutive addresses for the read ahead */ + for (i = 0; i < end_offset - dn.ofs_in_node; i++) + if (((datablock_addr(dn.node_page, + dn.ofs_in_node + i)) + != (dn.data_blkaddr + i)) || maxblocks == i) + break; + map_bh(bh_result, inode->i_sb, dn.data_blkaddr); + bh_result->b_size = (i << blkbits); + } + f2fs_put_dnode(&dn); + return 0; +} + +static int f2fs_read_data_page(struct file *file, struct page *page) +{ + return mpage_readpage(page, get_data_block_ro); +} + +static int f2fs_read_data_pages(struct file *file, + struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, get_data_block_ro); +} + +int do_write_data_page(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + block_t old_blk_addr, new_blk_addr; + struct dnode_of_data dn; + int err = 0; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, page->index, RDONLY_NODE); + if (err) + return err; + + old_blk_addr = dn.data_blkaddr; + + /* This page is already truncated */ + if (old_blk_addr == NULL_ADDR) + goto out_writepage; + + set_page_writeback(page); + + /* + * If the current allocation needs SSR, + * in-place writes are preferred for the updated data. + */ + if (old_blk_addr != NEW_ADDR && !is_cold_data(page) && + need_inplace_update(inode)) { + rewrite_data_page(F2FS_SB(inode->i_sb), page, + old_blk_addr); + } else { + write_data_page(inode, page, &dn, + old_blk_addr, &new_blk_addr); + update_extent_cache(new_blk_addr, &dn); + F2FS_I(inode)->data_version = + le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver); + } +out_writepage: + f2fs_put_dnode(&dn); + return err; +} + +static int f2fs_write_data_page(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + loff_t i_size = i_size_read(inode); + const pgoff_t end_index = ((unsigned long long) i_size) + >> PAGE_CACHE_SHIFT; + unsigned offset; + int err = 0; + + if (page->index < end_index) + goto out; + + /* + * If the offset is out-of-range of file size, + * this page does not have to be written to disk. + */ + offset = i_size & (PAGE_CACHE_SIZE - 1); + if ((page->index >= end_index + 1) || !offset) { + if (S_ISDIR(inode->i_mode)) { + dec_page_count(sbi, F2FS_DIRTY_DENTS); + inode_dec_dirty_dents(inode); + } + goto unlock_out; + } + + zero_user_segment(page, offset, PAGE_CACHE_SIZE); +out: + if (sbi->por_doing) + goto redirty_out; + + if (wbc->for_reclaim && !S_ISDIR(inode->i_mode) && !is_cold_data(page)) + goto redirty_out; + + mutex_lock_op(sbi, DATA_WRITE); + if (S_ISDIR(inode->i_mode)) { + dec_page_count(sbi, F2FS_DIRTY_DENTS); + inode_dec_dirty_dents(inode); + } + err = do_write_data_page(page); + if (err && err != -ENOENT) { + wbc->pages_skipped++; + set_page_dirty(page); + } + mutex_unlock_op(sbi, DATA_WRITE); + + if (wbc->for_reclaim) + f2fs_submit_bio(sbi, DATA, true); + + if (err == -ENOENT) + goto unlock_out; + + clear_cold_data(page); + unlock_page(page); + + if (!wbc->for_reclaim && !S_ISDIR(inode->i_mode)) + f2fs_balance_fs(sbi); + return 0; + +unlock_out: + unlock_page(page); + return (err == -ENOENT) ? 
0 : err; + +redirty_out: + wbc->pages_skipped++; + set_page_dirty(page); + return AOP_WRITEPAGE_ACTIVATE; +} + +#define MAX_DESIRED_PAGES_WP 4096 + +int f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + int ret; + long excess_nrtw = 0, desired_nrtw; + + if (wbc->nr_to_write < MAX_DESIRED_PAGES_WP) { + desired_nrtw = MAX_DESIRED_PAGES_WP; + excess_nrtw = desired_nrtw - wbc->nr_to_write; + wbc->nr_to_write = desired_nrtw; + } + + if (!S_ISDIR(inode->i_mode)) + mutex_lock(&sbi->writepages); + ret = generic_writepages(mapping, wbc); + if (!S_ISDIR(inode->i_mode)) + mutex_unlock(&sbi->writepages); + f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL)); + + remove_dirty_dir_inode(inode); + + wbc->nr_to_write -= excess_nrtw; + return ret; +} + +static int f2fs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct page *page; + pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT; + struct dnode_of_data dn; + int err = 0; + + /* for nobh_write_end */ + *fsdata = NULL; + + f2fs_balance_fs(sbi); + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + *pagep = page; + + mutex_lock_op(sbi, DATA_NEW); + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, index, 0); + if (err) { + mutex_unlock_op(sbi, DATA_NEW); + f2fs_put_page(page, 1); + return err; + } + + if (dn.data_blkaddr == NULL_ADDR) { + err = reserve_new_block(&dn); + if (err) { + f2fs_put_dnode(&dn); + mutex_unlock_op(sbi, DATA_NEW); + f2fs_put_page(page, 1); + return err; + } + } + f2fs_put_dnode(&dn); + + mutex_unlock_op(sbi, DATA_NEW); + + if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) + return 0; + + if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) { + unsigned start = pos & (PAGE_CACHE_SIZE - 1); + unsigned end = start + len; + + /* Reading beyond i_size is simple: memset to zero */ + zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE); + return 0; + } + + if (dn.data_blkaddr == NEW_ADDR) { + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + } else { + err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); + if (err) { + f2fs_put_page(page, 1); + return err; + } + } + SetPageUptodate(page); + clear_cold_data(page); + return 0; +} + +static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + + if (rw == WRITE) + return 0; + + /* Needs synchronization with the cleaner */ + return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, + get_data_block_ro); +} + +static void f2fs_invalidate_data_page(struct page *page, unsigned long offset) +{ + struct inode *inode = page->mapping->host; + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + if (S_ISDIR(inode->i_mode) && PageDirty(page)) { + dec_page_count(sbi, F2FS_DIRTY_DENTS); + inode_dec_dirty_dents(inode); + } + ClearPagePrivate(page); +} + +static int f2fs_release_data_page(struct page *page, gfp_t wait) +{ + ClearPagePrivate(page); + return 0; +} + +static int f2fs_set_data_page_dirty(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + + SetPageUptodate(page); + if 
(!PageDirty(page)) {
+		__set_page_dirty_nobuffers(page);
+		set_dirty_dir_page(inode, page);
+		return 1;
+	}
+	return 0;
+}
+
+const struct address_space_operations f2fs_dblock_aops = {
+	.readpage	= f2fs_read_data_page,
+	.readpages	= f2fs_read_data_pages,
+	.writepage	= f2fs_write_data_page,
+	.writepages	= f2fs_write_data_pages,
+	.write_begin	= f2fs_write_begin,
+	.write_end	= nobh_write_end,
+	.set_page_dirty	= f2fs_set_data_page_dirty,
+	.invalidatepage	= f2fs_invalidate_data_page,
+	.releasepage	= f2fs_release_data_page,
+	.direct_IO	= f2fs_direct_IO,
+};
--
cgit v1.2.1


From 19f99cee206cd4fe2e84176001bfd0b8b9dd4b42 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim
Date: Fri, 2 Nov 2012 17:10:40 +0900
Subject: f2fs: add core inode operations

This adds core functions to get, read, write, and evict an inode.

Signed-off-by: Changman Lee
Signed-off-by: Jaegeuk Kim
---
 fs/f2fs/inode.c | 266 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 266 insertions(+)
 create mode 100644 fs/f2fs/inode.c

(limited to 'fs')

diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
new file mode 100644
index 000000000000..94f13d2815e9
--- /dev/null
+++ b/fs/f2fs/inode.c
@@ -0,0 +1,266 @@
+/**
+ * fs/f2fs/inode.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+
+#include "f2fs.h"
+#include "node.h"
+
+struct f2fs_iget_args {
+	u64 ino;
+	int on_free;
+};
+
+void f2fs_set_inode_flags(struct inode *inode)
+{
+	unsigned int flags = F2FS_I(inode)->i_flags;
+
+	inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE |
+			S_NOATIME | S_DIRSYNC);
+
+	if (flags & FS_SYNC_FL)
+		inode->i_flags |= S_SYNC;
+	if (flags & FS_APPEND_FL)
+		inode->i_flags |= S_APPEND;
+	if (flags & FS_IMMUTABLE_FL)
+		inode->i_flags |= S_IMMUTABLE;
+	if (flags & FS_NOATIME_FL)
+		inode->i_flags |= S_NOATIME;
+	if (flags & FS_DIRSYNC_FL)
+		inode->i_flags |= S_DIRSYNC;
+}
+
+static int f2fs_iget_test(struct inode *inode, void *data)
+{
+	struct f2fs_iget_args *args = data;
+
+	if (inode->i_ino != args->ino)
+		return 0;
+	if (inode->i_state & (I_FREEING | I_WILL_FREE)) {
+		args->on_free = 1;
+		return 0;
+	}
+	return 1;
+}
+
+struct inode *f2fs_iget_nowait(struct super_block *sb, unsigned long ino)
+{
+	struct f2fs_iget_args args = {
+		.ino = ino,
+		.on_free = 0
+	};
+	struct inode *inode = ilookup5(sb, ino, f2fs_iget_test, &args);
+
+	if (inode)
+		return inode;
+	if (!args.on_free)
+		return f2fs_iget(sb, ino);
+	return ERR_PTR(-ENOENT);
+}
+
+static int do_read_inode(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	struct page *node_page;
+	struct f2fs_node *rn;
+	struct f2fs_inode *ri;
+
+	/* Check if ino is within scope */
+	check_nid_range(sbi, inode->i_ino);
+
+	node_page = get_node_page(sbi, inode->i_ino);
+	if (IS_ERR(node_page))
+		return PTR_ERR(node_page);
+
+	rn = page_address(node_page);
+	ri = &(rn->i);
+
+	inode->i_mode = le16_to_cpu(ri->i_mode);
+	i_uid_write(inode, le32_to_cpu(ri->i_uid));
+	i_gid_write(inode, le32_to_cpu(ri->i_gid));
+	set_nlink(inode, le32_to_cpu(ri->i_links));
+	inode->i_size = le64_to_cpu(ri->i_size);
+	inode->i_blocks = le64_to_cpu(ri->i_blocks);
+
+	inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime);
+	inode->i_ctime.tv_sec = le64_to_cpu(ri->i_ctime);
+
inode->i_mtime.tv_sec = le64_to_cpu(ri->i_mtime); + inode->i_atime.tv_nsec = le32_to_cpu(ri->i_atime_nsec); + inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); + inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); + inode->i_generation = le32_to_cpu(ri->i_generation); + + fi->i_current_depth = le32_to_cpu(ri->i_current_depth); + fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid); + fi->i_flags = le32_to_cpu(ri->i_flags); + fi->flags = 0; + fi->data_version = le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver) - 1; + fi->i_advise = ri->i_advise; + get_extent_info(&fi->ext, ri->i_ext); + f2fs_put_page(node_page, 1); + return 0; +} + +struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct inode *inode; + int ret; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi)) + goto make_now; + + ret = do_read_inode(inode); + if (ret) + goto bad_inode; + + if (!sbi->por_doing && inode->i_nlink == 0) { + ret = -ENOENT; + goto bad_inode; + } + +make_now: + if (ino == F2FS_NODE_INO(sbi)) { + inode->i_mapping->a_ops = &f2fs_node_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); + } else if (ino == F2FS_META_INO(sbi)) { + inode->i_mapping->a_ops = &f2fs_meta_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); + } else if (S_ISREG(inode->i_mode)) { + inode->i_op = &f2fs_file_inode_operations; + inode->i_fop = &f2fs_file_operations; + inode->i_mapping->a_ops = &f2fs_dblock_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &f2fs_dir_inode_operations; + inode->i_fop = &f2fs_dir_operations; + inode->i_mapping->a_ops = &f2fs_dblock_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER_MOVABLE | + __GFP_ZERO); + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &f2fs_symlink_inode_operations; + inode->i_mapping->a_ops = &f2fs_dblock_aops; + } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + inode->i_op = &f2fs_special_inode_operations; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + } else { + ret = -EIO; + goto bad_inode; + } + unlock_new_inode(inode); + + return inode; + +bad_inode: + iget_failed(inode); + return ERR_PTR(ret); +} + +void update_inode(struct inode *inode, struct page *node_page) +{ + struct f2fs_node *rn; + struct f2fs_inode *ri; + + wait_on_page_writeback(node_page); + + rn = page_address(node_page); + ri = &(rn->i); + + ri->i_mode = cpu_to_le16(inode->i_mode); + ri->i_advise = F2FS_I(inode)->i_advise; + ri->i_uid = cpu_to_le32(i_uid_read(inode)); + ri->i_gid = cpu_to_le32(i_gid_read(inode)); + ri->i_links = cpu_to_le32(inode->i_nlink); + ri->i_size = cpu_to_le64(i_size_read(inode)); + ri->i_blocks = cpu_to_le64(inode->i_blocks); + set_raw_extent(&F2FS_I(inode)->ext, &ri->i_ext); + + ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); + ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + ri->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); + ri->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); + ri->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ri->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + ri->i_current_depth = cpu_to_le32(F2FS_I(inode)->i_current_depth); + ri->i_xattr_nid = cpu_to_le32(F2FS_I(inode)->i_xattr_nid); + ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags); + ri->i_generation = cpu_to_le32(inode->i_generation); + set_page_dirty(node_page); +} + +int 
f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct page *node_page; + bool need_lock = false; + + if (inode->i_ino == F2FS_NODE_INO(sbi) || + inode->i_ino == F2FS_META_INO(sbi)) + return 0; + + node_page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(node_page)) + return PTR_ERR(node_page); + + if (!PageDirty(node_page)) { + need_lock = true; + f2fs_put_page(node_page, 1); + mutex_lock(&sbi->write_inode); + node_page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(node_page)) { + mutex_unlock(&sbi->write_inode); + return PTR_ERR(node_page); + } + } + update_inode(inode, node_page); + f2fs_put_page(node_page, 1); + if (need_lock) + mutex_unlock(&sbi->write_inode); + return 0; +} + +/** + * Called at the last iput() if i_nlink is zero + */ +void f2fs_evict_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + + truncate_inode_pages(&inode->i_data, 0); + + if (inode->i_ino == F2FS_NODE_INO(sbi) || + inode->i_ino == F2FS_META_INO(sbi)) + goto no_delete; + + BUG_ON(atomic_read(&F2FS_I(inode)->dirty_dents)); + remove_dirty_dir_inode(inode); + + if (inode->i_nlink || is_bad_inode(inode)) + goto no_delete; + + set_inode_flag(F2FS_I(inode), FI_NO_ALLOC); + i_size_write(inode, 0); + + if (F2FS_HAS_BLOCKS(inode)) + f2fs_truncate(inode); + + remove_inode_page(inode); +no_delete: + clear_inode(inode); +} -- cgit v1.2.1 From 57397d86c62dfee7bf1d60c9960201c78a9c4ec2 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 2 Nov 2012 17:11:10 +0900 Subject: f2fs: add inode operations for special inodes This adds inode operations for directory, symlink, and special inodes. Signed-off-by: Changman Lee Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 504 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 504 insertions(+) create mode 100644 fs/f2fs/namei.c (limited to 'fs') diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c new file mode 100644 index 000000000000..aec362f6f0b0 --- /dev/null +++ b/fs/f2fs/namei.c @@ -0,0 +1,504 @@ +/** + * fs/f2fs/namei.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+#include <linux/ctype.h>
+
+#include "f2fs.h"
+#include "xattr.h"
+#include "acl.h"
+
+static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
+{
+	struct super_block *sb = dir->i_sb;
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+	nid_t ino;
+	struct inode *inode;
+	bool nid_free = false;
+	int err;
+
+	inode = new_inode(sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_lock_op(sbi, NODE_NEW);
+	if (!alloc_nid(sbi, &ino)) {
+		mutex_unlock_op(sbi, NODE_NEW);
+		err = -ENOSPC;
+		goto fail;
+	}
+	mutex_unlock_op(sbi, NODE_NEW);
+
+	inode->i_uid = current_fsuid();
+
+	if (dir->i_mode & S_ISGID) {
+		inode->i_gid = dir->i_gid;
+		if (S_ISDIR(mode))
+			mode |= S_ISGID;
+	} else {
+		inode->i_gid = current_fsgid();
+	}
+
+	inode->i_ino = ino;
+	inode->i_mode = mode;
+	inode->i_blocks = 0;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	inode->i_generation = sbi->s_next_generation++;
+
+	err = insert_inode_locked(inode);
+	if (err) {
+		err = -EINVAL;
+		nid_free = true;
+		goto out;
+	}
+
+	mark_inode_dirty(inode);
+	return inode;
+
+out:
+	clear_nlink(inode);
+	unlock_new_inode(inode);
+fail:
+	iput(inode);
+	if (nid_free)
+		alloc_nid_failed(sbi, ino);
+	return ERR_PTR(err);
+}
+
+static int is_multimedia_file(const unsigned char *s, const char *sub)
+{
+	int slen = strlen(s);
+	int sublen = strlen(sub);
+	int ret;
+
+	if (sublen > slen)
+		return 1;
+
+	ret = memcmp(s + slen - sublen, sub, sublen);
+	if (ret) {	/* compare upper case */
+		int i;
+		char upper_sub[8];
+		for (i = 0; i < sublen && i < sizeof(upper_sub); i++)
+			upper_sub[i] = toupper(sub[i]);
+		return memcmp(s + slen - sublen, upper_sub, sublen);
+	}
+
+	return ret;
+}
+
+/**
+ * Set multimedia files as cold files for hot/cold data separation
+ */
+static inline void set_cold_file(struct f2fs_sb_info *sbi, struct inode *inode,
+		const unsigned char *name)
+{
+	int i;
+	__u8 (*extlist)[8] = sbi->raw_super->extension_list;
+
+	int count = le32_to_cpu(sbi->raw_super->extension_count);
+	for (i = 0; i < count; i++) {
+		if (!is_multimedia_file(name, extlist[i])) {
+			F2FS_I(inode)->i_advise |= FADVISE_COLD_BIT;
+			break;
+		}
+	}
+}
+
+static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+						bool excl)
+{
+	struct super_block *sb = dir->i_sb;
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+	struct inode *inode;
+	nid_t ino = 0;
+	int err;
+
+	inode = f2fs_new_inode(dir, mode);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	if (!test_opt(sbi, DISABLE_EXT_IDENTIFY))
+		set_cold_file(sbi, inode, dentry->d_name.name);
+
+	inode->i_op = &f2fs_file_inode_operations;
+	inode->i_fop = &f2fs_file_operations;
+	inode->i_mapping->a_ops = &f2fs_dblock_aops;
+	ino = inode->i_ino;
+
+	err = f2fs_add_link(dentry, inode);
+	if (err)
+		goto out;
+
+	alloc_nid_done(sbi, ino);
+
+	if (!sbi->por_doing)
+		d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
+
+	f2fs_balance_fs(sbi);
+	return 0;
+out:
+	clear_nlink(inode);
+	unlock_new_inode(inode);
+	iput(inode);
+	alloc_nid_failed(sbi, ino);
+	return err;
+}
+
+static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
+		struct dentry *dentry)
+{
+	struct inode *inode = old_dentry->d_inode;
+	struct super_block *sb = dir->i_sb;
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+	int err;
+
+	inode->i_ctime = CURRENT_TIME;
+	atomic_inc(&inode->i_count);
+
+	set_inode_flag(F2FS_I(inode), FI_INC_LINK);
+	err = f2fs_add_link(dentry, inode);
+	if (err)
+		goto out;
+
+	d_instantiate(dentry, inode);
+
+	f2fs_balance_fs(sbi);
+	return 0;
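+	/* on failure, undo the FI_INC_LINK hint and drop the extra reference */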
+out: + clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + iput(inode); + return err; +} + +struct dentry *f2fs_get_parent(struct dentry *child) +{ + struct qstr dotdot = QSTR_INIT("..", 2); + unsigned long ino = f2fs_inode_by_name(child->d_inode, &dotdot); + if (!ino) + return ERR_PTR(-ENOENT); + return d_obtain_alias(f2fs_iget(child->d_inode->i_sb, ino)); +} + +static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct inode *inode = NULL; + struct f2fs_dir_entry *de; + struct page *page; + + if (dentry->d_name.len > F2FS_MAX_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + de = f2fs_find_entry(dir, &dentry->d_name, &page); + if (de) { + nid_t ino = le32_to_cpu(de->ino); + kunmap(page); + f2fs_put_page(page, 0); + + inode = f2fs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + + return d_splice_alias(inode, dentry); +} + +static int f2fs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct super_block *sb = dir->i_sb; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct inode *inode = dentry->d_inode; + struct f2fs_dir_entry *de; + struct page *page; + int err = -ENOENT; + + de = f2fs_find_entry(dir, &dentry->d_name, &page); + if (!de) + goto fail; + + err = check_orphan_space(sbi); + if (err) { + kunmap(page); + f2fs_put_page(page, 0); + goto fail; + } + + f2fs_delete_entry(de, page, inode); + + /* In order to evict this inode, we set it dirty */ + mark_inode_dirty(inode); + f2fs_balance_fs(sbi); +fail: + return err; +} + +static int f2fs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct super_block *sb = dir->i_sb; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct inode *inode; + unsigned symlen = strlen(symname) + 1; + int err; + + inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &f2fs_symlink_inode_operations; + inode->i_mapping->a_ops = &f2fs_dblock_aops; + + err = f2fs_add_link(dentry, inode); + if (err) + goto out; + + err = page_symlink(inode, symname, symlen); + alloc_nid_done(sbi, inode->i_ino); + + d_instantiate(dentry, inode); + unlock_new_inode(inode); + + f2fs_balance_fs(sbi); + + return err; +out: + clear_nlink(inode); + unlock_new_inode(inode); + iput(inode); + alloc_nid_failed(sbi, inode->i_ino); + return err; +} + +static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + struct inode *inode; + int err; + + inode = f2fs_new_inode(dir, S_IFDIR | mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + return err; + + inode->i_op = &f2fs_dir_inode_operations; + inode->i_fop = &f2fs_dir_operations; + inode->i_mapping->a_ops = &f2fs_dblock_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS | __GFP_ZERO); + + set_inode_flag(F2FS_I(inode), FI_INC_LINK); + err = f2fs_add_link(dentry, inode); + if (err) + goto out_fail; + + alloc_nid_done(sbi, inode->i_ino); + + d_instantiate(dentry, inode); + unlock_new_inode(inode); + + f2fs_balance_fs(sbi); + return 0; + +out_fail: + clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + clear_nlink(inode); + unlock_new_inode(inode); + iput(inode); + alloc_nid_failed(sbi, inode->i_ino); + return err; +} + +static int f2fs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + if (f2fs_empty_dir(inode)) + return f2fs_unlink(dir, dentry); + return -ENOTEMPTY; +} + +static int f2fs_mknod(struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t rdev) +{ + 
struct super_block *sb = dir->i_sb; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct inode *inode; + int err = 0; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + inode = f2fs_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + init_special_inode(inode, inode->i_mode, rdev); + inode->i_op = &f2fs_special_inode_operations; + + err = f2fs_add_link(dentry, inode); + if (err) + goto out; + + alloc_nid_done(sbi, inode->i_ino); + d_instantiate(dentry, inode); + unlock_new_inode(inode); + + f2fs_balance_fs(sbi); + + return 0; +out: + clear_nlink(inode); + unlock_new_inode(inode); + iput(inode); + alloc_nid_failed(sbi, inode->i_ino); + return err; +} + +static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct super_block *sb = old_dir->i_sb; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct page *old_dir_page; + struct page *old_page; + struct f2fs_dir_entry *old_dir_entry = NULL; + struct f2fs_dir_entry *old_entry; + struct f2fs_dir_entry *new_entry; + int err = -ENOENT; + + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); + if (!old_entry) + goto out; + + if (S_ISDIR(old_inode->i_mode)) { + err = -EIO; + old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page); + if (!old_dir_entry) + goto out_old; + } + + mutex_lock_op(sbi, RENAME); + + if (new_inode) { + struct page *new_page; + + err = -ENOTEMPTY; + if (old_dir_entry && !f2fs_empty_dir(new_inode)) + goto out_dir; + + err = -ENOENT; + new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, + &new_page); + if (!new_entry) + goto out_dir; + + f2fs_set_link(new_dir, new_entry, new_page, old_inode); + + new_inode->i_ctime = CURRENT_TIME; + if (old_dir_entry) + drop_nlink(new_inode); + drop_nlink(new_inode); + if (!new_inode->i_nlink) + add_orphan_inode(sbi, new_inode->i_ino); + f2fs_write_inode(new_inode, NULL); + } else { + err = f2fs_add_link(new_dentry, old_inode); + if (err) + goto out_dir; + + if (old_dir_entry) { + inc_nlink(new_dir); + f2fs_write_inode(new_dir, NULL); + } + } + + old_inode->i_ctime = CURRENT_TIME; + set_inode_flag(F2FS_I(old_inode), FI_NEED_CP); + mark_inode_dirty(old_inode); + + f2fs_delete_entry(old_entry, old_page, NULL); + + if (old_dir_entry) { + if (old_dir != new_dir) { + f2fs_set_link(old_inode, old_dir_entry, + old_dir_page, new_dir); + } else { + kunmap(old_dir_page); + f2fs_put_page(old_dir_page, 0); + } + drop_nlink(old_dir); + f2fs_write_inode(old_dir, NULL); + } + + mutex_unlock_op(sbi, RENAME); + + f2fs_balance_fs(sbi); + return 0; + +out_dir: + if (old_dir_entry) { + kunmap(old_dir_page); + f2fs_put_page(old_dir_page, 0); + } + mutex_unlock_op(sbi, RENAME); +out_old: + kunmap(old_page); + f2fs_put_page(old_page, 0); +out: + return err; +} + +const struct inode_operations f2fs_dir_inode_operations = { + .create = f2fs_create, + .lookup = f2fs_lookup, + .link = f2fs_link, + .unlink = f2fs_unlink, + .symlink = f2fs_symlink, + .mkdir = f2fs_mkdir, + .rmdir = f2fs_rmdir, + .mknod = f2fs_mknod, + .rename = f2fs_rename, + .setattr = f2fs_setattr, + .get_acl = f2fs_get_acl, +#ifdef CONFIG_F2FS_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = f2fs_listxattr, + .removexattr = generic_removexattr, +#endif +}; + +const struct inode_operations f2fs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + 
.put_link	= page_put_link,
+	.setattr	= f2fs_setattr,
+#ifdef CONFIG_F2FS_FS_XATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= f2fs_listxattr,
+	.removexattr	= generic_removexattr,
+#endif
+};
+
+const struct inode_operations f2fs_special_inode_operations = {
+	.setattr	= f2fs_setattr,
+	.get_acl	= f2fs_get_acl,
+#ifdef CONFIG_F2FS_FS_XATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= f2fs_listxattr,
+	.removexattr	= generic_removexattr,
+#endif
+};
--
cgit v1.2.1


From 6b4ea0160ae236a6561defa28e19f973aedda9ff Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim
Date: Wed, 14 Nov 2012 16:59:04 +0900
Subject: f2fs: add core directory operations

This adds core functions to find, add, delete, and link dentries.

Signed-off-by: Jaegeuk Kim
---
 fs/f2fs/dir.c  | 672 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/f2fs/hash.c |  98 +++++++++
 2 files changed, 770 insertions(+)
 create mode 100644 fs/f2fs/dir.c
 create mode 100644 fs/f2fs/hash.c

(limited to 'fs')

diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
new file mode 100644
index 000000000000..5975568d03df
--- /dev/null
+++ b/fs/f2fs/dir.c
@@ -0,0 +1,672 @@
+/**
+ * fs/f2fs/dir.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include "f2fs.h"
+#include "acl.h"
+
+static unsigned long dir_blocks(struct inode *inode)
+{
+	return ((unsigned long long) (i_size_read(inode) + PAGE_CACHE_SIZE - 1))
+							>> PAGE_CACHE_SHIFT;
+}
+
+static unsigned int dir_buckets(unsigned int level)
+{
+	if (level < MAX_DIR_HASH_DEPTH / 2)
+		return 1 << level;
+	else
+		return 1 << ((MAX_DIR_HASH_DEPTH / 2) - 1);
+}
+
+static unsigned int bucket_blocks(unsigned int level)
+{
+	if (level < MAX_DIR_HASH_DEPTH / 2)
+		return 2;
+	else
+		return 4;
+}
+
+static unsigned char f2fs_filetype_table[F2FS_FT_MAX] = {
+	[F2FS_FT_UNKNOWN]	= DT_UNKNOWN,
+	[F2FS_FT_REG_FILE]	= DT_REG,
+	[F2FS_FT_DIR]		= DT_DIR,
+	[F2FS_FT_CHRDEV]	= DT_CHR,
+	[F2FS_FT_BLKDEV]	= DT_BLK,
+	[F2FS_FT_FIFO]		= DT_FIFO,
+	[F2FS_FT_SOCK]		= DT_SOCK,
+	[F2FS_FT_SYMLINK]	= DT_LNK,
+};
+
+#define S_SHIFT 12
+static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = {
+	[S_IFREG >> S_SHIFT]	= F2FS_FT_REG_FILE,
+	[S_IFDIR >> S_SHIFT]	= F2FS_FT_DIR,
+	[S_IFCHR >> S_SHIFT]	= F2FS_FT_CHRDEV,
+	[S_IFBLK >> S_SHIFT]	= F2FS_FT_BLKDEV,
+	[S_IFIFO >> S_SHIFT]	= F2FS_FT_FIFO,
+	[S_IFSOCK >> S_SHIFT]	= F2FS_FT_SOCK,
+	[S_IFLNK >> S_SHIFT]	= F2FS_FT_SYMLINK,
+};
+
+static void set_de_type(struct f2fs_dir_entry *de, struct inode *inode)
+{
+	mode_t mode = inode->i_mode;
+	de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
+}
+
+static unsigned long dir_block_index(unsigned int level, unsigned int idx)
+{
+	unsigned long i;
+	unsigned long bidx = 0;
+
+	for (i = 0; i < level; i++)
+		bidx += dir_buckets(i) * bucket_blocks(i);
+	bidx += idx * bucket_blocks(level);
+	return bidx;
+}
+
+static bool early_match_name(const char *name, int namelen,
+			f2fs_hash_t namehash, struct f2fs_dir_entry *de)
+{
+	if (le16_to_cpu(de->name_len) != namelen)
+		return false;
+
+	if (le32_to_cpu(de->hash_code) != namehash)
+		return false;
+
+	return true;
+}
+
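The three helpers above fix the on-disk layout of a directory: level n holds dir_buckets(n) buckets of bucket_blocks(n) blocks each, and dir_block_index() turns a (level, bucket) pair into a block offset inside the directory file. The stand-alone sketch below mirrors that arithmetic for illustration only; the MAX_DIR_HASH_DEPTH value of 64 is an assumption taken from f2fs_fs.h, which is not part of this patch:

	#include <stdio.h>

	#define MAX_DIR_HASH_DEPTH	64	/* assumed from f2fs_fs.h */

	static unsigned int dir_buckets(unsigned int level)
	{
		/* bucket count doubles per level, then saturates */
		return level < MAX_DIR_HASH_DEPTH / 2 ?
				1U << level : 1U << (MAX_DIR_HASH_DEPTH / 2 - 1);
	}

	static unsigned int bucket_blocks(unsigned int level)
	{
		/* shallow levels use 2-block buckets, deep levels 4-block ones */
		return level < MAX_DIR_HASH_DEPTH / 2 ? 2 : 4;
	}

	static unsigned long dir_block_index(unsigned int level, unsigned int idx)
	{
		unsigned long bidx = 0;
		unsigned int i;

		/* skip all blocks occupied by the shallower levels */
		for (i = 0; i < level; i++)
			bidx += dir_buckets(i) * bucket_blocks(i);
		return bidx + idx * bucket_blocks(level);
	}

	int main(void)
	{
		unsigned int hash = 0x2f, level;

		/* which directory blocks would be probed for this hash? */
		for (level = 0; level < 4; level++) {
			unsigned long b = dir_block_index(level,
						hash % dir_buckets(level));
			printf("level %u: blocks %lu-%lu\n", level,
					b, b + bucket_blocks(level) - 1);
		}
		return 0;	/* prints 0-1, 4-5, 12-13, 28-29 */
	}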
+static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
+			const char *name, int namelen, int *max_slots,
+			f2fs_hash_t namehash, struct page **res_page)
+{
+	struct f2fs_dir_entry *de;
+	unsigned long bit_pos, end_pos, next_pos;
+	struct f2fs_dentry_block *dentry_blk = kmap(dentry_page);
+	int slots;
+
+	bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
+					NR_DENTRY_IN_BLOCK, 0);
+	while (bit_pos < NR_DENTRY_IN_BLOCK) {
+		de = &dentry_blk->dentry[bit_pos];
+		slots = (le16_to_cpu(de->name_len) + F2FS_NAME_LEN - 1) /
+							F2FS_NAME_LEN;
+
+		if (early_match_name(name, namelen, namehash, de)) {
+			if (!memcmp(dentry_blk->filename[bit_pos],
+							name, namelen)) {
+				*res_page = dentry_page;
+				goto found;
+			}
+		}
+		next_pos = bit_pos + slots;
+		bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
+				NR_DENTRY_IN_BLOCK, next_pos);
+		if (bit_pos >= NR_DENTRY_IN_BLOCK)
+			end_pos = NR_DENTRY_IN_BLOCK;
+		else
+			end_pos = bit_pos;
+		if (*max_slots < end_pos - next_pos)
+			*max_slots = end_pos - next_pos;
+	}
+
+	de = NULL;
+	kunmap(dentry_page);
+found:
+	return de;
+}
+
+static struct f2fs_dir_entry *find_in_level(struct inode *dir,
+		unsigned int level, const char *name, int namelen,
+			f2fs_hash_t namehash, struct page **res_page)
+{
+	int s = (namelen + F2FS_NAME_LEN - 1) / F2FS_NAME_LEN;
+	unsigned int nbucket, nblock;
+	unsigned int bidx, end_block;
+	struct page *dentry_page;
+	struct f2fs_dir_entry *de = NULL;
+	bool room = false;
+	int max_slots = 0;
+
+	BUG_ON(level > MAX_DIR_HASH_DEPTH);
+
+	nbucket = dir_buckets(level);
+	nblock = bucket_blocks(level);
+
+	bidx = dir_block_index(level, namehash % nbucket);
+	end_block = bidx + nblock;
+
+	for (; bidx < end_block; bidx++) {
+		/* no need to allocate new dentry pages to all the indices */
+		dentry_page = find_data_page(dir, bidx);
+		if (IS_ERR(dentry_page)) {
+			room = true;
+			continue;
+		}
+
+		de = find_in_block(dentry_page, name, namelen,
+					&max_slots, namehash, res_page);
+		if (de)
+			break;
+
+		if (max_slots >= s)
+			room = true;
+		f2fs_put_page(dentry_page, 0);
+	}
+
+	if (!de && room && F2FS_I(dir)->chash != namehash) {
+		F2FS_I(dir)->chash = namehash;
+		F2FS_I(dir)->clevel = level;
+	}
+
+	return de;
+}
+
+/*
+ * Find an entry in the specified directory with the wanted name.
+ * It returns the page where the entry was found (as a parameter - res_page),
+ * and the entry itself. Page is returned mapped and unlocked.
+ * Entry is guaranteed to be valid.
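+ * The caller must kunmap() the returned page and release it with
+ * f2fs_put_page() once it has finished with the entry.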
+ */ +struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, + struct qstr *child, struct page **res_page) +{ + const char *name = child->name; + int namelen = child->len; + unsigned long npages = dir_blocks(dir); + struct f2fs_dir_entry *de = NULL; + f2fs_hash_t name_hash; + unsigned int max_depth; + unsigned int level; + + if (npages == 0) + return NULL; + + *res_page = NULL; + + name_hash = f2fs_dentry_hash(name, namelen); + max_depth = F2FS_I(dir)->i_current_depth; + + for (level = 0; level < max_depth; level++) { + de = find_in_level(dir, level, name, + namelen, name_hash, res_page); + if (de) + break; + } + if (!de && F2FS_I(dir)->chash != name_hash) { + F2FS_I(dir)->chash = name_hash; + F2FS_I(dir)->clevel = level - 1; + } + return de; +} + +struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) +{ + struct page *page = NULL; + struct f2fs_dir_entry *de = NULL; + struct f2fs_dentry_block *dentry_blk = NULL; + + page = get_lock_data_page(dir, 0); + if (IS_ERR(page)) + return NULL; + + dentry_blk = kmap(page); + de = &dentry_blk->dentry[1]; + *p = page; + unlock_page(page); + return de; +} + +ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr) +{ + ino_t res = 0; + struct f2fs_dir_entry *de; + struct page *page; + + de = f2fs_find_entry(dir, qstr, &page); + if (de) { + res = le32_to_cpu(de->ino); + kunmap(page); + f2fs_put_page(page, 0); + } + + return res; +} + +void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, + struct page *page, struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + + mutex_lock_op(sbi, DENTRY_OPS); + lock_page(page); + wait_on_page_writeback(page); + de->ino = cpu_to_le32(inode->i_ino); + set_de_type(de, inode); + kunmap(page); + set_page_dirty(page); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + mark_inode_dirty(dir); + f2fs_put_page(page, 1); + mutex_unlock_op(sbi, DENTRY_OPS); +} + +void init_dent_inode(struct dentry *dentry, struct page *ipage) +{ + struct inode *dir = dentry->d_parent->d_inode; + struct f2fs_node *rn; + + if (IS_ERR(ipage)) + return; + + wait_on_page_writeback(ipage); + + /* copy dentry info. 
to this inode page */ + rn = (struct f2fs_node *)page_address(ipage); + rn->i.i_pino = cpu_to_le32(dir->i_ino); + rn->i.i_namelen = cpu_to_le32(dentry->d_name.len); + memcpy(rn->i.i_name, dentry->d_name.name, dentry->d_name.len); + set_page_dirty(ipage); +} + +static int init_inode_metadata(struct inode *inode, struct dentry *dentry) +{ + struct inode *dir = dentry->d_parent->d_inode; + + if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { + int err; + err = new_inode_page(inode, dentry); + if (err) + return err; + + if (S_ISDIR(inode->i_mode)) { + err = f2fs_make_empty(inode, dir); + if (err) { + remove_inode_page(inode); + return err; + } + } + + err = f2fs_init_acl(inode, dir); + if (err) { + remove_inode_page(inode); + return err; + } + } else { + struct page *ipage; + ipage = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + init_dent_inode(dentry, ipage); + f2fs_put_page(ipage, 1); + } + if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) { + inc_nlink(inode); + f2fs_write_inode(inode, NULL); + } + return 0; +} + +static void update_parent_metadata(struct inode *dir, struct inode *inode, + unsigned int current_depth) +{ + bool need_dir_update = false; + + if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { + if (S_ISDIR(inode->i_mode)) { + inc_nlink(dir); + need_dir_update = true; + } + clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); + } + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + if (F2FS_I(dir)->i_current_depth != current_depth) { + F2FS_I(dir)->i_current_depth = current_depth; + need_dir_update = true; + } + + if (need_dir_update) + f2fs_write_inode(dir, NULL); + else + mark_inode_dirty(dir); + + if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) + clear_inode_flag(F2FS_I(inode), FI_INC_LINK); +} + +static int room_for_filename(struct f2fs_dentry_block *dentry_blk, int slots) +{ + int bit_start = 0; + int zero_start, zero_end; +next: + zero_start = find_next_zero_bit_le(&dentry_blk->dentry_bitmap, + NR_DENTRY_IN_BLOCK, + bit_start); + if (zero_start >= NR_DENTRY_IN_BLOCK) + return NR_DENTRY_IN_BLOCK; + + zero_end = find_next_bit_le(&dentry_blk->dentry_bitmap, + NR_DENTRY_IN_BLOCK, + zero_start); + if (zero_end - zero_start >= slots) + return zero_start; + + bit_start = zero_end + 1; + + if (zero_end + 1 >= NR_DENTRY_IN_BLOCK) + return NR_DENTRY_IN_BLOCK; + goto next; +} + +int f2fs_add_link(struct dentry *dentry, struct inode *inode) +{ + unsigned int bit_pos; + unsigned int level; + unsigned int current_depth; + unsigned long bidx, block; + f2fs_hash_t dentry_hash; + struct f2fs_dir_entry *de; + unsigned int nbucket, nblock; + struct inode *dir = dentry->d_parent->d_inode; + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct page *dentry_page = NULL; + struct f2fs_dentry_block *dentry_blk = NULL; + int slots = (namelen + F2FS_NAME_LEN - 1) / F2FS_NAME_LEN; + int err = 0; + int i; + + dentry_hash = f2fs_dentry_hash(name, dentry->d_name.len); + level = 0; + current_depth = F2FS_I(dir)->i_current_depth; + if (F2FS_I(dir)->chash == dentry_hash) { + level = F2FS_I(dir)->clevel; + F2FS_I(dir)->chash = 0; + } + +start: + if (current_depth == MAX_DIR_HASH_DEPTH) + return -ENOSPC; + + /* Increase the depth, if required */ + if (level == current_depth) + ++current_depth; + + nbucket = dir_buckets(level); + nblock = bucket_blocks(level); + + bidx = dir_block_index(level, (dentry_hash % nbucket)); + + for (block = bidx; block <= (bidx + nblock - 1); block++) 
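+	/* probe every block of this bucket for a large enough free slot run */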
{
+		mutex_lock_op(sbi, DENTRY_OPS);
+		dentry_page = get_new_data_page(dir, block, true);
+		if (IS_ERR(dentry_page)) {
+			mutex_unlock_op(sbi, DENTRY_OPS);
+			return PTR_ERR(dentry_page);
+		}
+
+		dentry_blk = kmap(dentry_page);
+		bit_pos = room_for_filename(dentry_blk, slots);
+		if (bit_pos < NR_DENTRY_IN_BLOCK)
+			goto add_dentry;
+
+		kunmap(dentry_page);
+		f2fs_put_page(dentry_page, 1);
+		mutex_unlock_op(sbi, DENTRY_OPS);
+	}
+
+	/* Move to next level to find the empty slot for new dentry */
+	++level;
+	goto start;
+add_dentry:
+	err = init_inode_metadata(inode, dentry);
+	if (err)
+		goto fail;
+
+	wait_on_page_writeback(dentry_page);
+
+	de = &dentry_blk->dentry[bit_pos];
+	de->hash_code = cpu_to_le32(dentry_hash);
+	de->name_len = cpu_to_le16(namelen);
+	memcpy(dentry_blk->filename[bit_pos], name, namelen);
+	de->ino = cpu_to_le32(inode->i_ino);
+	set_de_type(de, inode);
+	for (i = 0; i < slots; i++)
+		test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
+	set_page_dirty(dentry_page);
+	update_parent_metadata(dir, inode, current_depth);
+fail:
+	kunmap(dentry_page);
+	f2fs_put_page(dentry_page, 1);
+	mutex_unlock_op(sbi, DENTRY_OPS);
+	return err;
+}
+
+/**
+ * It only removes the dentry from the dentry page; the corresponding name
+ * entry in the name page does not need to be touched during deletion.
+ */
+void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
+						struct inode *inode)
+{
+	struct f2fs_dentry_block *dentry_blk;
+	unsigned int bit_pos;
+	struct address_space *mapping = page->mapping;
+	struct inode *dir = mapping->host;
+	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+	int slots = (le16_to_cpu(dentry->name_len) + F2FS_NAME_LEN - 1) /
+							F2FS_NAME_LEN;
+	void *kaddr = page_address(page);
+	int i;
+
+	mutex_lock_op(sbi, DENTRY_OPS);
+
+	lock_page(page);
+	wait_on_page_writeback(page);
+
+	dentry_blk = (struct f2fs_dentry_block *)kaddr;
+	bit_pos = dentry - (struct f2fs_dir_entry *)dentry_blk->dentry;
+	for (i = 0; i < slots; i++)
+		test_and_clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
+
+	/* Let's check and deallocate this dentry page */
+	bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
+			NR_DENTRY_IN_BLOCK,
+			0);
+	kunmap(page); /* kunmap - pair of f2fs_find_entry */
+	set_page_dirty(page);
+
+	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+
+	if (inode && S_ISDIR(inode->i_mode)) {
+		drop_nlink(dir);
+		f2fs_write_inode(dir, NULL);
+	} else {
+		mark_inode_dirty(dir);
+	}
+
+	if (inode) {
+		inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+		drop_nlink(inode);
+		if (S_ISDIR(inode->i_mode)) {
+			drop_nlink(inode);
+			i_size_write(inode, 0);
+		}
+		f2fs_write_inode(inode, NULL);
+		if (inode->i_nlink == 0)
+			add_orphan_inode(sbi, inode->i_ino);
+	}
+
+	if (bit_pos == NR_DENTRY_IN_BLOCK) {
+		loff_t page_offset;
+		truncate_hole(dir, page->index, page->index + 1);
+		clear_page_dirty_for_io(page);
+		ClearPageUptodate(page);
+		dec_page_count(sbi, F2FS_DIRTY_DENTS);
+		inode_dec_dirty_dents(dir);
+		page_offset = page->index << PAGE_CACHE_SHIFT;
+		f2fs_put_page(page, 1);
+	} else {
+		f2fs_put_page(page, 1);
+	}
+	mutex_unlock_op(sbi, DENTRY_OPS);
+}
+
+int f2fs_make_empty(struct inode *inode, struct inode *parent)
+{
+	struct page *dentry_page;
+	struct f2fs_dentry_block *dentry_blk;
+	struct f2fs_dir_entry *de;
+	void *kaddr;
+
+	dentry_page = get_new_data_page(inode, 0, true);
+	if (IS_ERR(dentry_page))
+		return PTR_ERR(dentry_page);
+
+	kaddr = kmap_atomic(dentry_page);
+	dentry_blk = (struct f2fs_dentry_block *)kaddr;
+
+	de =
&dentry_blk->dentry[0]; + de->name_len = cpu_to_le16(1); + de->hash_code = 0; + de->ino = cpu_to_le32(inode->i_ino); + memcpy(dentry_blk->filename[0], ".", 1); + set_de_type(de, inode); + + de = &dentry_blk->dentry[1]; + de->hash_code = 0; + de->name_len = cpu_to_le16(2); + de->ino = cpu_to_le32(parent->i_ino); + memcpy(dentry_blk->filename[1], "..", 2); + set_de_type(de, inode); + + test_and_set_bit_le(0, &dentry_blk->dentry_bitmap); + test_and_set_bit_le(1, &dentry_blk->dentry_bitmap); + kunmap_atomic(kaddr); + + set_page_dirty(dentry_page); + f2fs_put_page(dentry_page, 1); + return 0; +} + +bool f2fs_empty_dir(struct inode *dir) +{ + unsigned long bidx; + struct page *dentry_page; + unsigned int bit_pos; + struct f2fs_dentry_block *dentry_blk; + unsigned long nblock = dir_blocks(dir); + + for (bidx = 0; bidx < nblock; bidx++) { + void *kaddr; + dentry_page = get_lock_data_page(dir, bidx); + if (IS_ERR(dentry_page)) { + if (PTR_ERR(dentry_page) == -ENOENT) + continue; + else + return false; + } + + kaddr = kmap_atomic(dentry_page); + dentry_blk = (struct f2fs_dentry_block *)kaddr; + if (bidx == 0) + bit_pos = 2; + else + bit_pos = 0; + bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, + NR_DENTRY_IN_BLOCK, + bit_pos); + kunmap_atomic(kaddr); + + f2fs_put_page(dentry_page, 1); + + if (bit_pos < NR_DENTRY_IN_BLOCK) + return false; + } + return true; +} + +static int f2fs_readdir(struct file *file, void *dirent, filldir_t filldir) +{ + unsigned long pos = file->f_pos; + struct inode *inode = file->f_dentry->d_inode; + unsigned long npages = dir_blocks(inode); + unsigned char *types = NULL; + unsigned int bit_pos = 0, start_bit_pos = 0; + int over = 0; + struct f2fs_dentry_block *dentry_blk = NULL; + struct f2fs_dir_entry *de = NULL; + struct page *dentry_page = NULL; + unsigned int n = 0; + unsigned char d_type = DT_UNKNOWN; + int slots; + + types = f2fs_filetype_table; + bit_pos = (pos % NR_DENTRY_IN_BLOCK); + n = (pos / NR_DENTRY_IN_BLOCK); + + for ( ; n < npages; n++) { + dentry_page = get_lock_data_page(inode, n); + if (IS_ERR(dentry_page)) + continue; + + start_bit_pos = bit_pos; + dentry_blk = kmap(dentry_page); + while (bit_pos < NR_DENTRY_IN_BLOCK) { + d_type = DT_UNKNOWN; + bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, + NR_DENTRY_IN_BLOCK, + bit_pos); + if (bit_pos >= NR_DENTRY_IN_BLOCK) + break; + + de = &dentry_blk->dentry[bit_pos]; + if (types && de->file_type < F2FS_FT_MAX) + d_type = types[de->file_type]; + + over = filldir(dirent, + dentry_blk->filename[bit_pos], + le16_to_cpu(de->name_len), + (n * NR_DENTRY_IN_BLOCK) + bit_pos, + le32_to_cpu(de->ino), d_type); + if (over) { + file->f_pos += bit_pos - start_bit_pos; + goto success; + } + slots = (le16_to_cpu(de->name_len) + F2FS_NAME_LEN - 1) + / F2FS_NAME_LEN; + bit_pos += slots; + } + bit_pos = 0; + file->f_pos = (n + 1) * NR_DENTRY_IN_BLOCK; + kunmap(dentry_page); + f2fs_put_page(dentry_page, 1); + dentry_page = NULL; + } +success: + if (dentry_page && !IS_ERR(dentry_page)) { + kunmap(dentry_page); + f2fs_put_page(dentry_page, 1); + } + + return 0; +} + +const struct file_operations f2fs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = f2fs_readdir, + .fsync = f2fs_sync_file, + .unlocked_ioctl = f2fs_ioctl, +}; diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c new file mode 100644 index 000000000000..098a1963d7c7 --- /dev/null +++ b/fs/f2fs/hash.c @@ -0,0 +1,98 @@ +/** + * fs/f2fs/hash.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. 
+ * http://www.samsung.com/
+ *
+ * Portions of this code from linux/fs/ext3/hash.c
+ *
+ * Copyright (C) 2002 by Theodore Ts'o
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/cryptohash.h>
+#include <linux/pagemap.h>
+
+#include "f2fs.h"
+
+/*
+ * Hashing code copied from ext3
+ */
+#define DELTA 0x9E3779B9
+
+static void TEA_transform(unsigned int buf[4], unsigned int const in[])
+{
+	__u32 sum = 0;
+	__u32 b0 = buf[0], b1 = buf[1];
+	__u32 a = in[0], b = in[1], c = in[2], d = in[3];
+	int n = 16;
+
+	do {
+		sum += DELTA;
+		b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
+		b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
+	} while (--n);
+
+	buf[0] += b0;
+	buf[1] += b1;
+}
+
+static void str2hashbuf(const char *msg, int len, unsigned int *buf, int num)
+{
+	unsigned pad, val;
+	int i;
+
+	pad = (__u32)len | ((__u32)len << 8);
+	pad |= pad << 16;
+
+	val = pad;
+	if (len > num * 4)
+		len = num * 4;
+	for (i = 0; i < len; i++) {
+		if ((i % 4) == 0)
+			val = pad;
+		val = msg[i] + (val << 8);
+		if ((i % 4) == 3) {
+			*buf++ = val;
+			val = pad;
+			num--;
+		}
+	}
+	if (--num >= 0)
+		*buf++ = val;
+	while (--num >= 0)
+		*buf++ = pad;
+}
+
+f2fs_hash_t f2fs_dentry_hash(const char *name, int len)
+{
+	__u32 hash, minor_hash;
+	f2fs_hash_t f2fs_hash;
+	const char *p;
+	__u32 in[8], buf[4];
+
+	/* Initialize the default seed for the hash checksum functions */
+	buf[0] = 0x67452301;
+	buf[1] = 0xefcdab89;
+	buf[2] = 0x98badcfe;
+	buf[3] = 0x10325476;
+
+	p = name;
+	while (len > 0) {
+		str2hashbuf(p, len, in, 4);
+		TEA_transform(buf, in);
+		len -= 16;
+		p += 16;
+	}
+	hash = buf[0];
+	minor_hash = buf[1];
+
+	f2fs_hash = hash;
+	f2fs_hash &= ~F2FS_HASH_COL_BIT;
+	return f2fs_hash;
+}
--
cgit v1.2.1


From af48b85b8cd3fbb12c9b6759c16db6d69c0b03da Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim
Date: Fri, 2 Nov 2012 17:12:17 +0900
Subject: f2fs: add xattr and acl functionalities

This implements xattr and acl functionalities.

- F2FS uses a node page to contain extended attributes.

Signed-off-by: Changman Lee
Signed-off-by: Jaegeuk Kim
---
 fs/f2fs/acl.c   | 465 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/f2fs/acl.h   |  57 +++++++
 fs/f2fs/xattr.c | 389 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/f2fs/xattr.h | 145 ++++++++++++++++++
 4 files changed, 1056 insertions(+)
 create mode 100644 fs/f2fs/acl.c
 create mode 100644 fs/f2fs/acl.h
 create mode 100644 fs/f2fs/xattr.c
 create mode 100644 fs/f2fs/xattr.h

(limited to 'fs')

diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
new file mode 100644
index 000000000000..dff2a2bfa755
--- /dev/null
+++ b/fs/f2fs/acl.c
@@ -0,0 +1,465 @@
+/**
+ * fs/f2fs/acl.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * Portions of this code from linux/fs/ext2/acl.c
+ *
+ * Copyright (C) 2001-2003 Andreas Gruenbacher,
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/f2fs_fs.h>
+#include "f2fs.h"
+#include "xattr.h"
+#include "acl.h"
+
+#define get_inode_mode(i)	((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ?
\ + (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) + +static inline size_t f2fs_acl_size(int count) +{ + if (count <= 4) { + return sizeof(struct f2fs_acl_header) + + count * sizeof(struct f2fs_acl_entry_short); + } else { + return sizeof(struct f2fs_acl_header) + + 4 * sizeof(struct f2fs_acl_entry_short) + + (count - 4) * sizeof(struct f2fs_acl_entry); + } +} + +static inline int f2fs_acl_count(size_t size) +{ + ssize_t s; + size -= sizeof(struct f2fs_acl_header); + s = size - 4 * sizeof(struct f2fs_acl_entry_short); + if (s < 0) { + if (size % sizeof(struct f2fs_acl_entry_short)) + return -1; + return size / sizeof(struct f2fs_acl_entry_short); + } else { + if (s % sizeof(struct f2fs_acl_entry)) + return -1; + return s / sizeof(struct f2fs_acl_entry) + 4; + } +} + +static struct posix_acl *f2fs_acl_from_disk(const char *value, size_t size) +{ + int i, count; + struct posix_acl *acl; + struct f2fs_acl_header *hdr = (struct f2fs_acl_header *)value; + struct f2fs_acl_entry *entry = (struct f2fs_acl_entry *)(hdr + 1); + const char *end = value + size; + + if (hdr->a_version != cpu_to_le32(F2FS_ACL_VERSION)) + return ERR_PTR(-EINVAL); + + count = f2fs_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + + acl = posix_acl_alloc(count, GFP_KERNEL); + if (!acl) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < count; i++) { + + if ((char *)entry > end) + goto fail; + + acl->a_entries[i].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[i].e_perm = le16_to_cpu(entry->e_perm); + + switch (acl->a_entries[i].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + acl->a_entries[i].e_id = ACL_UNDEFINED_ID; + entry = (struct f2fs_acl_entry *)((char *)entry + + sizeof(struct f2fs_acl_entry_short)); + break; + + case ACL_USER: + acl->a_entries[i].e_uid = + make_kuid(&init_user_ns, + le32_to_cpu(entry->e_id)); + entry = (struct f2fs_acl_entry *)((char *)entry + + sizeof(struct f2fs_acl_entry)); + break; + case ACL_GROUP: + acl->a_entries[i].e_gid = + make_kgid(&init_user_ns, + le32_to_cpu(entry->e_id)); + entry = (struct f2fs_acl_entry *)((char *)entry + + sizeof(struct f2fs_acl_entry)); + break; + default: + goto fail; + } + } + if ((char *)entry != end) + goto fail; + return acl; +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size) +{ + struct f2fs_acl_header *f2fs_acl; + struct f2fs_acl_entry *entry; + int i; + + f2fs_acl = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count * + sizeof(struct f2fs_acl_entry), GFP_KERNEL); + if (!f2fs_acl) + return ERR_PTR(-ENOMEM); + + f2fs_acl->a_version = cpu_to_le32(F2FS_ACL_VERSION); + entry = (struct f2fs_acl_entry *)(f2fs_acl + 1); + + for (i = 0; i < acl->a_count; i++) { + + entry->e_tag = cpu_to_le16(acl->a_entries[i].e_tag); + entry->e_perm = cpu_to_le16(acl->a_entries[i].e_perm); + + switch (acl->a_entries[i].e_tag) { + case ACL_USER: + entry->e_id = cpu_to_le32( + from_kuid(&init_user_ns, + acl->a_entries[i].e_uid)); + entry = (struct f2fs_acl_entry *)((char *)entry + + sizeof(struct f2fs_acl_entry)); + break; + case ACL_GROUP: + entry->e_id = cpu_to_le32( + from_kgid(&init_user_ns, + acl->a_entries[i].e_gid)); + entry = (struct f2fs_acl_entry *)((char *)entry + + sizeof(struct f2fs_acl_entry)); + break; + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + entry = (struct f2fs_acl_entry *)((char *)entry + + sizeof(struct f2fs_acl_entry_short)); + break; + default: + goto fail; 
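+			/* any other tag means the in-core ACL is malformed */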
+ } + } + *size = f2fs_acl_size(acl->a_count); + return (void *)f2fs_acl; + +fail: + kfree(f2fs_acl); + return ERR_PTR(-EINVAL); +} + +struct posix_acl *f2fs_get_acl(struct inode *inode, int type) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + int name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT; + void *value = NULL; + struct posix_acl *acl; + int retval; + + if (!test_opt(sbi, POSIX_ACL)) + return NULL; + + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + if (type == ACL_TYPE_ACCESS) + name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; + + retval = f2fs_getxattr(inode, name_index, "", NULL, 0); + if (retval > 0) { + value = kmalloc(retval, GFP_KERNEL); + if (!value) + return ERR_PTR(-ENOMEM); + retval = f2fs_getxattr(inode, name_index, "", value, retval); + } + + if (retval < 0) { + if (retval == -ENODATA) + acl = NULL; + else + acl = ERR_PTR(retval); + } else { + acl = f2fs_acl_from_disk(value, retval); + } + kfree(value); + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + + return acl; +} + +static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_inode_info *fi = F2FS_I(inode); + int name_index; + void *value = NULL; + size_t size = 0; + int error; + + if (!test_opt(sbi, POSIX_ACL)) + return 0; + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; + if (acl) { + error = posix_acl_equiv_mode(acl, &inode->i_mode); + if (error < 0) + return error; + set_acl_inode(fi, inode->i_mode); + if (error == 0) + acl = NULL; + } + break; + + case ACL_TYPE_DEFAULT: + name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? 
-EACCES : 0; + break; + + default: + return -EINVAL; + } + + if (acl) { + value = f2fs_acl_to_disk(acl, &size); + if (IS_ERR(value)) { + cond_clear_inode_flag(fi, FI_ACL_MODE); + return (int)PTR_ERR(value); + } + } + + error = f2fs_setxattr(inode, name_index, "", value, size); + + kfree(value); + if (!error) + set_cached_acl(inode, type, acl); + + cond_clear_inode_flag(fi, FI_ACL_MODE); + return error; +} + +int f2fs_init_acl(struct inode *inode, struct inode *dir) +{ + struct posix_acl *acl = NULL; + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + int error = 0; + + if (!S_ISLNK(inode->i_mode)) { + if (test_opt(sbi, POSIX_ACL)) { + acl = f2fs_get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + if (!acl) + inode->i_mode &= ~current_umask(); + } + + if (test_opt(sbi, POSIX_ACL) && acl) { + + if (S_ISDIR(inode->i_mode)) { + error = f2fs_set_acl(inode, ACL_TYPE_DEFAULT, acl); + if (error) + goto cleanup; + } + error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); + if (error < 0) + return error; + if (error > 0) + error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl); + } +cleanup: + posix_acl_release(acl); + return error; +} + +int f2fs_acl_chmod(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct posix_acl *acl; + int error; + mode_t mode = get_inode_mode(inode); + + if (!test_opt(sbi, POSIX_ACL)) + return 0; + if (S_ISLNK(mode)) + return -EOPNOTSUPP; + + acl = f2fs_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + + error = posix_acl_chmod(&acl, GFP_KERNEL, mode); + if (error) + return error; + error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl); + posix_acl_release(acl); + return error; +} + +static size_t f2fs_xattr_list_acl(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); + const char *xname = POSIX_ACL_XATTR_DEFAULT; + size_t size; + + if (!test_opt(sbi, POSIX_ACL)) + return 0; + + if (type == ACL_TYPE_ACCESS) + xname = POSIX_ACL_XATTR_ACCESS; + + size = strlen(xname) + 1; + if (list && size <= list_size) + memcpy(list, xname, size); + return size; +} + +static int f2fs_xattr_get_acl(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); + struct posix_acl *acl; + int error; + + if (strcmp(name, "") != 0) + return -EINVAL; + if (!test_opt(sbi, POSIX_ACL)) + return -EOPNOTSUPP; + + acl = f2fs_get_acl(dentry->d_inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (!acl) + return -ENODATA; + error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); + posix_acl_release(acl); + + return error; +} + +static int f2fs_xattr_set_acl(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); + struct inode *inode = dentry->d_inode; + struct posix_acl *acl = NULL; + int error; + + if (strcmp(name, "") != 0) + return -EINVAL; + if (!test_opt(sbi, POSIX_ACL)) + return -EOPNOTSUPP; + if (!inode_owner_or_capable(inode)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(&init_user_ns, value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + error = posix_acl_valid(acl); + if (error) + goto release_and_out; + } + } else { + acl = NULL; + } + + error = f2fs_set_acl(inode, type, acl); + +release_and_out: + posix_acl_release(acl); + return error; +} + +const struct xattr_handler 
f2fs_xattr_acl_default_handler = {
+	.prefix = POSIX_ACL_XATTR_DEFAULT,
+	.flags = ACL_TYPE_DEFAULT,
+	.list = f2fs_xattr_list_acl,
+	.get = f2fs_xattr_get_acl,
+	.set = f2fs_xattr_set_acl,
+};
+
+const struct xattr_handler f2fs_xattr_acl_access_handler = {
+	.prefix = POSIX_ACL_XATTR_ACCESS,
+	.flags = ACL_TYPE_ACCESS,
+	.list = f2fs_xattr_list_acl,
+	.get = f2fs_xattr_get_acl,
+	.set = f2fs_xattr_set_acl,
+};
+
+static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list,
+		size_t list_size, const char *name, size_t name_len, int type)
+{
+	const char *xname = F2FS_SYSTEM_ADVISE_PREFIX;
+	size_t size;
+
+	if (type != F2FS_XATTR_INDEX_ADVISE)
+		return 0;
+
+	size = strlen(xname) + 1;
+	if (list && size <= list_size)
+		memcpy(list, xname, size);
+	return size;
+}
+
+static int f2fs_xattr_advise_get(struct dentry *dentry, const char *name,
+		void *buffer, size_t size, int type)
+{
+	struct inode *inode = dentry->d_inode;
+
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+
+	*((char *)buffer) = F2FS_I(inode)->i_advise;
+	return sizeof(char);
+}
+
+static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name,
+		const void *value, size_t size, int flags, int type)
+{
+	struct inode *inode = dentry->d_inode;
+
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	if (!inode_owner_or_capable(inode))
+		return -EPERM;
+	if (value == NULL)
+		return -EINVAL;
+
+	F2FS_I(inode)->i_advise |= *(char *)value;
+	return 0;
+}
+
+const struct xattr_handler f2fs_xattr_advise_handler = {
+	.prefix = F2FS_SYSTEM_ADVISE_PREFIX,
+	.flags	= F2FS_XATTR_INDEX_ADVISE,
+	.list   = f2fs_xattr_advise_list,
+	.get    = f2fs_xattr_advise_get,
+	.set    = f2fs_xattr_advise_set,
+};
diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h
new file mode 100644
index 000000000000..c97675e18fe2
--- /dev/null
+++ b/fs/f2fs/acl.h
@@ -0,0 +1,57 @@
+/**
+ * fs/f2fs/acl.h
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * Portions of this code from linux/fs/ext2/acl.h
+ *
+ * Copyright (C) 2001-2003 Andreas Gruenbacher,
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __F2FS_ACL_H__
+#define __F2FS_ACL_H__
+
+#include <linux/posix_acl_xattr.h>
+
+#define F2FS_ACL_VERSION	0x0001
+
+struct f2fs_acl_entry {
+	__le16 e_tag;
+	__le16 e_perm;
+	__le32 e_id;
+};
+
+struct f2fs_acl_entry_short {
+	__le16 e_tag;
+	__le16 e_perm;
+};
+
+struct f2fs_acl_header {
+	__le32 a_version;
+};
+
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+
+extern struct posix_acl *f2fs_get_acl(struct inode *inode, int type);
+extern int f2fs_acl_chmod(struct inode *inode);
+extern int f2fs_init_acl(struct inode *inode, struct inode *dir);
+#else
+#define f2fs_check_acl	NULL
+#define f2fs_get_acl	NULL
+#define f2fs_set_acl	NULL
+
+static inline int f2fs_acl_chmod(struct inode *inode)
+{
+	return 0;
+}
+
+static inline int f2fs_init_acl(struct inode *inode, struct inode *dir)
+{
+	return 0;
+}
+#endif
+#endif /* __F2FS_ACL_H__ */
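The entry layouts above drive the size arithmetic of f2fs_acl_size() and f2fs_acl_count() in acl.c earlier in this patch: the four tag-only entries (USER_OBJ, GROUP_OBJ, MASK, OTHER) are stored without an e_id field, and everything beyond them carries one. A stand-alone sketch of that math, with stdint stand-ins for the kernel's __le types (illustration only, not part of the patch):

	#include <stdint.h>
	#include <stdio.h>

	struct hdr       { uint32_t a_version; };	/* f2fs_acl_header    */
	struct ent_short { uint16_t e_tag, e_perm; };	/* tag-only entry     */
	struct ent       { uint16_t e_tag, e_perm;
			   uint32_t e_id; };		/* ACL_USER/ACL_GROUP */

	static size_t acl_size(int count)
	{
		/* at most four entries come without an id; the rest carry one */
		if (count <= 4)
			return sizeof(struct hdr) +
					count * sizeof(struct ent_short);
		return sizeof(struct hdr) + 4 * sizeof(struct ent_short) +
				(count - 4) * sizeof(struct ent);
	}

	int main(void)
	{
		/* 4 + 3*4 = 16 bytes;  4 + 4*4 + 2*8 = 36 bytes */
		printf("%zu %zu\n", acl_size(3), acl_size(6));
		return 0;
	}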
+ * Extended attributes for symlinks and special files added per + * suggestion of Luka Renko . + * xattr consolidation Copyright (c) 2004 James Morris , + * Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include "f2fs.h" +#include "xattr.h" + +static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); + int total_len, prefix_len = 0; + const char *prefix = NULL; + + switch (type) { + case F2FS_XATTR_INDEX_USER: + if (!test_opt(sbi, XATTR_USER)) + return -EOPNOTSUPP; + prefix = XATTR_USER_PREFIX; + prefix_len = XATTR_USER_PREFIX_LEN; + break; + case F2FS_XATTR_INDEX_TRUSTED: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + prefix = XATTR_TRUSTED_PREFIX; + prefix_len = XATTR_TRUSTED_PREFIX_LEN; + break; + default: + return -EINVAL; + } + + total_len = prefix_len + name_len + 1; + if (list && total_len <= list_size) { + memcpy(list, prefix, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); + + switch (type) { + case F2FS_XATTR_INDEX_USER: + if (!test_opt(sbi, XATTR_USER)) + return -EOPNOTSUPP; + break; + case F2FS_XATTR_INDEX_TRUSTED: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + break; + default: + return -EINVAL; + } + if (strcmp(name, "") == 0) + return -EINVAL; + return f2fs_getxattr(dentry->d_inode, type, name, + buffer, size); +} + +static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); + + switch (type) { + case F2FS_XATTR_INDEX_USER: + if (!test_opt(sbi, XATTR_USER)) + return -EOPNOTSUPP; + break; + case F2FS_XATTR_INDEX_TRUSTED: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + break; + default: + return -EINVAL; + } + if (strcmp(name, "") == 0) + return -EINVAL; + + return f2fs_setxattr(dentry->d_inode, type, name, value, size); +} + +const struct xattr_handler f2fs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .flags = F2FS_XATTR_INDEX_USER, + .list = f2fs_xattr_generic_list, + .get = f2fs_xattr_generic_get, + .set = f2fs_xattr_generic_set, +}; + +const struct xattr_handler f2fs_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .flags = F2FS_XATTR_INDEX_TRUSTED, + .list = f2fs_xattr_generic_list, + .get = f2fs_xattr_generic_get, + .set = f2fs_xattr_generic_set, +}; + +static const struct xattr_handler *f2fs_xattr_handler_map[] = { + [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler, +#ifdef CONFIG_F2FS_FS_POSIX_ACL + [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &f2fs_xattr_acl_access_handler, + [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler, +#endif + [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler, + [F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler, +}; + +const struct xattr_handler *f2fs_xattr_handlers[] = { + &f2fs_xattr_user_handler, +#ifdef CONFIG_F2FS_FS_POSIX_ACL + &f2fs_xattr_acl_access_handler, + &f2fs_xattr_acl_default_handler, +#endif + &f2fs_xattr_trusted_handler, + &f2fs_xattr_advise_handler, + NULL, +}; + +static inline const 
struct xattr_handler *f2fs_xattr_handler(int name_index) +{ + const struct xattr_handler *handler = NULL; + + if (name_index > 0 && name_index < ARRAY_SIZE(f2fs_xattr_handler_map)) + handler = f2fs_xattr_handler_map[name_index]; + return handler; +} + +int f2fs_getxattr(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_xattr_entry *entry; + struct page *page; + void *base_addr; + int error = 0, found = 0; + int value_len, name_len; + + if (name == NULL) + return -EINVAL; + name_len = strlen(name); + + if (!fi->i_xattr_nid) + return -ENODATA; + + page = get_node_page(sbi, fi->i_xattr_nid); + base_addr = page_address(page); + + list_for_each_xattr(entry, base_addr) { + if (entry->e_name_index != name_index) + continue; + if (entry->e_name_len != name_len) + continue; + if (!memcmp(entry->e_name, name, name_len)) { + found = 1; + break; + } + } + if (!found) { + error = -ENODATA; + goto cleanup; + } + + value_len = le16_to_cpu(entry->e_value_size); + + if (buffer && value_len > buffer_size) { + error = -ERANGE; + goto cleanup; + } + + if (buffer) { + char *pval = entry->e_name + entry->e_name_len; + memcpy(buffer, pval, value_len); + } + error = value_len; + +cleanup: + f2fs_put_page(page, 1); + return error; +} + +ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + struct inode *inode = dentry->d_inode; + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_xattr_entry *entry; + struct page *page; + void *base_addr; + int error = 0; + size_t rest = buffer_size; + + if (!fi->i_xattr_nid) + return 0; + + page = get_node_page(sbi, fi->i_xattr_nid); + base_addr = page_address(page); + + list_for_each_xattr(entry, base_addr) { + const struct xattr_handler *handler = + f2fs_xattr_handler(entry->e_name_index); + size_t size; + + if (!handler) + continue; + + size = handler->list(dentry, buffer, rest, entry->e_name, + entry->e_name_len, handler->flags); + if (buffer && size > rest) { + error = -ERANGE; + goto cleanup; + } + + if (buffer) + buffer += size; + rest -= size; + } + error = buffer_size - rest; +cleanup: + f2fs_put_page(page, 1); + return error; +} + +int f2fs_setxattr(struct inode *inode, int name_index, const char *name, + const void *value, size_t value_len) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_xattr_header *header = NULL; + struct f2fs_xattr_entry *here, *last; + struct page *page; + void *base_addr; + int error, found, free, name_len, newsize; + char *pval; + + if (name == NULL) + return -EINVAL; + name_len = strlen(name); + + if (value == NULL) + value_len = 0; + + if (name_len > 255 || value_len > MAX_VALUE_LEN) + return -ERANGE; + + mutex_lock_op(sbi, NODE_NEW); + if (!fi->i_xattr_nid) { + /* Allocate new attribute block */ + struct dnode_of_data dn; + + if (!alloc_nid(sbi, &fi->i_xattr_nid)) { + mutex_unlock_op(sbi, NODE_NEW); + return -ENOSPC; + } + set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid); + mark_inode_dirty(inode); + + page = new_node_page(&dn, XATTR_NODE_OFFSET); + if (IS_ERR(page)) { + alloc_nid_failed(sbi, fi->i_xattr_nid); + fi->i_xattr_nid = 0; + mutex_unlock_op(sbi, NODE_NEW); + return PTR_ERR(page); + } + + alloc_nid_done(sbi, fi->i_xattr_nid); + base_addr = page_address(page); + header = XATTR_HDR(base_addr); + header->h_magic = 
cpu_to_le32(F2FS_XATTR_MAGIC); + header->h_refcount = cpu_to_le32(1); + } else { + /* The inode already has an extended attribute block. */ + page = get_node_page(sbi, fi->i_xattr_nid); + if (IS_ERR(page)) { + mutex_unlock_op(sbi, NODE_NEW); + return PTR_ERR(page); + } + + base_addr = page_address(page); + header = XATTR_HDR(base_addr); + } + + if (le32_to_cpu(header->h_magic) != F2FS_XATTR_MAGIC) { + error = -EIO; + goto cleanup; + } + + /* find entry with wanted name. */ + found = 0; + list_for_each_xattr(here, base_addr) { + if (here->e_name_index != name_index) + continue; + if (here->e_name_len != name_len) + continue; + if (!memcmp(here->e_name, name, name_len)) { + found = 1; + break; + } + } + + last = here; + + while (!IS_XATTR_LAST_ENTRY(last)) + last = XATTR_NEXT_ENTRY(last); + + newsize = XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + + name_len + value_len); + + /* 1. Check space */ + if (value) { + /* If value is NULL, it is remove operation. + * In case of update operation, we caculate free. + */ + free = MIN_OFFSET - ((char *)last - (char *)header); + if (found) + free = free - ENTRY_SIZE(here); + + if (free < newsize) { + error = -ENOSPC; + goto cleanup; + } + } + + /* 2. Remove old entry */ + if (found) { + /* If entry is found, remove old entry. + * If not found, remove operation is not needed. + */ + struct f2fs_xattr_entry *next = XATTR_NEXT_ENTRY(here); + int oldsize = ENTRY_SIZE(here); + + memmove(here, next, (char *)last - (char *)next); + last = (struct f2fs_xattr_entry *)((char *)last - oldsize); + memset(last, 0, oldsize); + } + + /* 3. Write new entry */ + if (value) { + /* Before we come here, old entry is removed. + * We just write new entry. */ + memset(last, 0, newsize); + last->e_name_index = name_index; + last->e_name_len = name_len; + memcpy(last->e_name, name, name_len); + pval = last->e_name + name_len; + memcpy(pval, value, value_len); + last->e_value_size = cpu_to_le16(value_len); + } + + set_page_dirty(page); + f2fs_put_page(page, 1); + + if (is_inode_flag_set(fi, FI_ACL_MODE)) { + inode->i_mode = fi->i_acl_mode; + inode->i_ctime = CURRENT_TIME; + clear_inode_flag(fi, FI_ACL_MODE); + } + f2fs_write_inode(inode, NULL); + mutex_unlock_op(sbi, NODE_NEW); + + return 0; +cleanup: + f2fs_put_page(page, 1); + mutex_unlock_op(sbi, NODE_NEW); + return error; +} diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h new file mode 100644 index 000000000000..29b0a08e1e14 --- /dev/null +++ b/fs/f2fs/xattr.h @@ -0,0 +1,145 @@ +/** + * fs/f2fs/xattr.h + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * Portions of this code from linux/fs/ext2/xattr.h + * + * On-disk format of extended attributes for the ext2 filesystem. + * + * (C) 2001 Andreas Gruenbacher, + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#ifndef __F2FS_XATTR_H__ +#define __F2FS_XATTR_H__ + +#include +#include + +/* Magic value in attribute blocks */ +#define F2FS_XATTR_MAGIC 0xF2F52011 + +/* Maximum number of references to one attribute block */ +#define F2FS_XATTR_REFCOUNT_MAX 1024 + +/* Name indexes */ +#define F2FS_SYSTEM_ADVISE_PREFIX "system.advise" +#define F2FS_XATTR_INDEX_USER 1 +#define F2FS_XATTR_INDEX_POSIX_ACL_ACCESS 2 +#define F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT 3 +#define F2FS_XATTR_INDEX_TRUSTED 4 +#define F2FS_XATTR_INDEX_LUSTRE 5 +#define F2FS_XATTR_INDEX_SECURITY 6 +#define F2FS_XATTR_INDEX_ADVISE 7 + +struct f2fs_xattr_header { + __le32 h_magic; /* magic number for identification */ + __le32 h_refcount; /* reference count */ + __u32 h_reserved[4]; /* zero right now */ +}; + +struct f2fs_xattr_entry { + __u8 e_name_index; + __u8 e_name_len; + __le16 e_value_size; /* size of attribute value */ + char e_name[0]; /* attribute name */ +}; + +#define XATTR_HDR(ptr) ((struct f2fs_xattr_header *)(ptr)) +#define XATTR_ENTRY(ptr) ((struct f2fs_xattr_entry *)(ptr)) +#define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr)+1)) +#define XATTR_ROUND (3) + +#define XATTR_ALIGN(size) ((size + XATTR_ROUND) & ~XATTR_ROUND) + +#define ENTRY_SIZE(entry) (XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + \ + entry->e_name_len + le16_to_cpu(entry->e_value_size))) + +#define XATTR_NEXT_ENTRY(entry) ((struct f2fs_xattr_entry *)((char *)(entry) +\ + ENTRY_SIZE(entry))) + +#define IS_XATTR_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) + +#define list_for_each_xattr(entry, addr) \ + for (entry = XATTR_FIRST_ENTRY(addr);\ + !IS_XATTR_LAST_ENTRY(entry);\ + entry = XATTR_NEXT_ENTRY(entry)) + + +#define MIN_OFFSET XATTR_ALIGN(PAGE_SIZE - \ + sizeof(struct node_footer) - \ + sizeof(__u32)) + +#define MAX_VALUE_LEN (MIN_OFFSET - sizeof(struct f2fs_xattr_header) - \ + sizeof(struct f2fs_xattr_entry)) + +/** + * On-disk structure of f2fs_xattr + * We use only 1 block for xattr. 
+ * + * +--------------------+ + * | f2fs_xattr_header | + * | | + * +--------------------+ + * | f2fs_xattr_entry | + * | .e_name_index = 1 | + * | .e_name_len = 3 | + * | .e_value_size = 14 | + * | .e_name = "foo" | + * | "value_of_xattr" |<- value_offs = e_name + e_name_len + * +--------------------+ + * | f2fs_xattr_entry | + * | .e_name_index = 4 | + * | .e_name = "bar" | + * +--------------------+ + * | | + * | Free | + * | | + * +--------------------+<- MIN_OFFSET + * | node_footer | + * | (nid, ino, offset) | + * +--------------------+ + * + **/ + +#ifdef CONFIG_F2FS_FS_XATTR +extern const struct xattr_handler f2fs_xattr_user_handler; +extern const struct xattr_handler f2fs_xattr_trusted_handler; +extern const struct xattr_handler f2fs_xattr_acl_access_handler; +extern const struct xattr_handler f2fs_xattr_acl_default_handler; +extern const struct xattr_handler f2fs_xattr_advise_handler; + +extern const struct xattr_handler *f2fs_xattr_handlers[]; + +extern int f2fs_setxattr(struct inode *inode, int name_index, const char *name, + const void *value, size_t value_len); +extern int f2fs_getxattr(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size); +extern ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, + size_t buffer_size); + +#else + +#define f2fs_xattr_handlers NULL +static inline int f2fs_setxattr(struct inode *inode, int name_index, + const char *name, const void *value, size_t value_len) +{ + return -EOPNOTSUPP; +} +static inline int f2fs_getxattr(struct inode *inode, int name_index, + const char *name, void *buffer, size_t buffer_size) +{ + return -EOPNOTSUPP; +} +static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, + size_t buffer_size) +{ + return -EOPNOTSUPP; +} +#endif + +#endif /* __F2FS_XATTR_H__ */ -- cgit v1.2.1 From 7bc0900347e069a1676d28ad6f98cafaf8cfd6e9 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 2 Nov 2012 17:13:01 +0900 Subject: f2fs: add garbage collection functions This adds on-demand and background cleaning functions. - The basic background cleaning policy is trying to do cleaning jobs as much as possible whenever the system is idle. Once the background cleaning is done, the cleaner sleeps an amount of time not to interfere with VFS calls. The time is dynamically adjusted according to the status of whole segments, which is decreased when the following conditions are satisfied. . GC is not conducted currently, and . IO subsystem is idle by checking the number of requets in bdev's request list, and . There are enough dirty segments. Otherwise, the time is increased incrementally until to the maximum time. Note that, min and max times are 10 secs and 30 secs by default. - F2FS adopts a default victim selection policy where background cleaning uses a cost-benefit algorithm, while on-demand cleaning uses a greedy algorithm. - The method of moving data during the cleaning is slightly different between background and on-demand cleaning schemes. In the case of background cleaning, F2FS loads the data, and marks them as dirty. Then, F2FS expects that the data will be moved by flusher or VM. In the case of on-demand cleaning, F2FS should move the data right away. - In order to identify valid blocks in a victim segment, F2FS scans the bitmap of the segment managed as an SIT entry. 
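- For intuition, the cost-benefit score combines a section's utilization u (the percentage of blocks still valid) with the age of its data, so that old, mostly-invalid sections are preferred. Below is a minimal standalone sketch of that scoring, simplified from get_cb_cost() in this patch; the helper name and free-standing form are illustrative only, not part of the patch:

#include <linux/kernel.h>	/* UINT_MAX */

/*
 * Illustrative sketch, not in the patch.
 * Lower score = better victim: few valid blocks to move (cheap to
 * clean) and old data (unlikely to be invalidated for free soon).
 */
static unsigned int cb_cost(unsigned int u, unsigned int age)
{
	return UINT_MAX - ((100 * (100 - u) * age) / (100 + u));
}

  The greedy policy instead ranks candidates by get_valid_blocks() alone, which is cheaper to evaluate and suits on-demand cleaning.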
Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 742 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/f2fs/gc.h | 117 ++++++++++ 2 files changed, 859 insertions(+) create mode 100644 fs/f2fs/gc.c create mode 100644 fs/f2fs/gc.h (limited to 'fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c new file mode 100644 index 000000000000..46774ce3ae03 --- /dev/null +++ b/fs/f2fs/gc.c @@ -0,0 +1,742 @@ +/** + * fs/f2fs/gc.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include "gc.h" + +static struct kmem_cache *winode_slab; + +static int gc_thread_func(void *data) +{ + struct f2fs_sb_info *sbi = data; + wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; + long wait_ms; + + wait_ms = GC_THREAD_MIN_SLEEP_TIME; + + do { + if (try_to_freeze()) + continue; + else + wait_event_interruptible_timeout(*wq, + kthread_should_stop(), + msecs_to_jiffies(wait_ms)); + if (kthread_should_stop()) + break; + + f2fs_balance_fs(sbi); + + if (!test_opt(sbi, BG_GC)) + continue; + + /* + * [GC triggering condition] + * 0. GC is not conducted currently. + * 1. There are enough dirty segments. + * 2. IO subsystem is idle by checking the # of writeback pages. + * 3. IO subsystem is idle by checking the # of requests in + * bdev's request list. + * + * Note) We have to avoid triggering GCs too much frequently. + * Because it is possible that some segments can be + * invalidated soon after by user update or deletion. + * So, I'd like to wait some time to collect dirty segments. + */ + if (!mutex_trylock(&sbi->gc_mutex)) + continue; + + if (!is_idle(sbi)) { + wait_ms = increase_sleep_time(wait_ms); + mutex_unlock(&sbi->gc_mutex); + continue; + } + + if (has_enough_invalid_blocks(sbi)) + wait_ms = decrease_sleep_time(wait_ms); + else + wait_ms = increase_sleep_time(wait_ms); + + sbi->bg_gc++; + + if (f2fs_gc(sbi, 1) == GC_NONE) + wait_ms = GC_THREAD_NOGC_SLEEP_TIME; + else if (wait_ms == GC_THREAD_NOGC_SLEEP_TIME) + wait_ms = GC_THREAD_MAX_SLEEP_TIME; + + } while (!kthread_should_stop()); + return 0; +} + +int start_gc_thread(struct f2fs_sb_info *sbi) +{ + struct f2fs_gc_kthread *gc_th = NULL; + + gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); + if (!gc_th) + return -ENOMEM; + + sbi->gc_thread = gc_th; + init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); + sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, + GC_THREAD_NAME); + if (IS_ERR(gc_th->f2fs_gc_task)) { + kfree(gc_th); + return -ENOMEM; + } + return 0; +} + +void stop_gc_thread(struct f2fs_sb_info *sbi) +{ + struct f2fs_gc_kthread *gc_th = sbi->gc_thread; + if (!gc_th) + return; + kthread_stop(gc_th->f2fs_gc_task); + kfree(gc_th); + sbi->gc_thread = NULL; +} + +static int select_gc_type(int gc_type) +{ + return (gc_type == BG_GC) ? 
GC_CB : GC_GREEDY;
+}
+
+static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
+			int type, struct victim_sel_policy *p)
+{
+	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+
+	if (p->alloc_mode) {
+		p->gc_mode = GC_GREEDY;
+		p->dirty_segmap = dirty_i->dirty_segmap[type];
+		p->ofs_unit = 1;
+	} else {
+		p->gc_mode = select_gc_type(gc_type);
+		p->dirty_segmap = dirty_i->dirty_segmap[DIRTY];
+		p->ofs_unit = sbi->segs_per_sec;
+	}
+	p->offset = sbi->last_victim[p->gc_mode];
+}
+
+static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
+				struct victim_sel_policy *p)
+{
+	if (p->gc_mode == GC_GREEDY)
+		return (1 << sbi->log_blocks_per_seg) * p->ofs_unit;
+	else if (p->gc_mode == GC_CB)
+		return UINT_MAX;
+	else /* No other gc_mode */
+		return 0;
+}
+
+static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
+{
+	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+	unsigned int segno;
+
+	/*
+	 * If the gc_type is FG_GC, we can select victim segments
+	 * selected by background GC before.
+	 * Those segments are guaranteed to have few valid blocks.
+	 */
+	segno = find_next_bit(dirty_i->victim_segmap[BG_GC],
+			TOTAL_SEGS(sbi), 0);
+	if (segno < TOTAL_SEGS(sbi)) {
+		clear_bit(segno, dirty_i->victim_segmap[BG_GC]);
+		return segno;
+	}
+	return NULL_SEGNO;
+}
+
+static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	unsigned int secno = GET_SECNO(sbi, segno);
+	unsigned int start = secno * sbi->segs_per_sec;
+	unsigned long long mtime = 0;
+	unsigned int vblocks;
+	unsigned char age = 0;
+	unsigned char u;
+	unsigned int i;
+
+	for (i = 0; i < sbi->segs_per_sec; i++)
+		mtime += get_seg_entry(sbi, start + i)->mtime;
+	vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
+
+	mtime = div_u64(mtime, sbi->segs_per_sec);
+	vblocks = div_u64(vblocks, sbi->segs_per_sec);
+
+	u = (vblocks * 100) >> sbi->log_blocks_per_seg;
+
+	/* Handle if the system time is changed by user */
+	if (mtime < sit_i->min_mtime)
+		sit_i->min_mtime = mtime;
+	if (mtime > sit_i->max_mtime)
+		sit_i->max_mtime = mtime;
+	if (sit_i->max_mtime != sit_i->min_mtime)
+		age = 100 - div64_u64(100 * (mtime - sit_i->min_mtime),
+				sit_i->max_mtime - sit_i->min_mtime);
+
+	return UINT_MAX - ((100 * (100 - u) * age) / (100 + u));
+}
+
+static unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno,
+			struct victim_sel_policy *p)
+{
+	if (p->alloc_mode == SSR)
+		return get_seg_entry(sbi, segno)->ckpt_valid_blocks;
+
+	/* alloc_mode == LFS */
+	if (p->gc_mode == GC_GREEDY)
+		return get_valid_blocks(sbi, segno, sbi->segs_per_sec);
+	else
+		return get_cb_cost(sbi, segno);
+}
+
+/**
+ * This function is called from two paths.
+ * One is garbage collection and the other is SSR segment selection.
+ * When it is called during GC, it just gets a victim segment
+ * and it does not remove it from dirty seglist.
+ * When it is called from SSR segment selection, it finds a segment
+ * which has minimum valid blocks and removes it from dirty seglist.
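+ * Both paths share the scan loop below; only the cost metric and the
+ * dirty segmap to search differ, as configured by select_policy().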
+ */ +static int get_victim_by_default(struct f2fs_sb_info *sbi, + unsigned int *result, int gc_type, int type, char alloc_mode) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + struct victim_sel_policy p; + unsigned int segno; + int nsearched = 0; + + p.alloc_mode = alloc_mode; + select_policy(sbi, gc_type, type, &p); + + p.min_segno = NULL_SEGNO; + p.min_cost = get_max_cost(sbi, &p); + + mutex_lock(&dirty_i->seglist_lock); + + if (p.alloc_mode == LFS && gc_type == FG_GC) { + p.min_segno = check_bg_victims(sbi); + if (p.min_segno != NULL_SEGNO) + goto got_it; + } + + while (1) { + unsigned long cost; + + segno = find_next_bit(p.dirty_segmap, + TOTAL_SEGS(sbi), p.offset); + if (segno >= TOTAL_SEGS(sbi)) { + if (sbi->last_victim[p.gc_mode]) { + sbi->last_victim[p.gc_mode] = 0; + p.offset = 0; + continue; + } + break; + } + p.offset = ((segno / p.ofs_unit) * p.ofs_unit) + p.ofs_unit; + + if (test_bit(segno, dirty_i->victim_segmap[FG_GC])) + continue; + if (gc_type == BG_GC && + test_bit(segno, dirty_i->victim_segmap[BG_GC])) + continue; + if (IS_CURSEC(sbi, GET_SECNO(sbi, segno))) + continue; + + cost = get_gc_cost(sbi, segno, &p); + + if (p.min_cost > cost) { + p.min_segno = segno; + p.min_cost = cost; + } + + if (cost == get_max_cost(sbi, &p)) + continue; + + if (nsearched++ >= MAX_VICTIM_SEARCH) { + sbi->last_victim[p.gc_mode] = segno; + break; + } + } +got_it: + if (p.min_segno != NULL_SEGNO) { + *result = (p.min_segno / p.ofs_unit) * p.ofs_unit; + if (p.alloc_mode == LFS) { + int i; + for (i = 0; i < p.ofs_unit; i++) + set_bit(*result + i, + dirty_i->victim_segmap[gc_type]); + } + } + mutex_unlock(&dirty_i->seglist_lock); + + return (p.min_segno == NULL_SEGNO) ? 0 : 1; +} + +static const struct victim_selection default_v_ops = { + .get_victim = get_victim_by_default, +}; + +static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist) +{ + struct list_head *this; + struct inode_entry *ie; + + list_for_each(this, ilist) { + ie = list_entry(this, struct inode_entry, list); + if (ie->inode->i_ino == ino) + return ie->inode; + } + return NULL; +} + +static void add_gc_inode(struct inode *inode, struct list_head *ilist) +{ + struct list_head *this; + struct inode_entry *new_ie, *ie; + + list_for_each(this, ilist) { + ie = list_entry(this, struct inode_entry, list); + if (ie->inode == inode) { + iput(inode); + return; + } + } +repeat: + new_ie = kmem_cache_alloc(winode_slab, GFP_NOFS); + if (!new_ie) { + cond_resched(); + goto repeat; + } + new_ie->inode = inode; + list_add_tail(&new_ie->list, ilist); +} + +static void put_gc_inode(struct list_head *ilist) +{ + struct inode_entry *ie, *next_ie; + list_for_each_entry_safe(ie, next_ie, ilist, list) { + iput(ie->inode); + list_del(&ie->list); + kmem_cache_free(winode_slab, ie); + } +} + +static int check_valid_map(struct f2fs_sb_info *sbi, + unsigned int segno, int offset) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct seg_entry *sentry; + int ret; + + mutex_lock(&sit_i->sentry_lock); + sentry = get_seg_entry(sbi, segno); + ret = f2fs_test_bit(offset, sentry->cur_valid_map); + mutex_unlock(&sit_i->sentry_lock); + return ret ? GC_OK : GC_NEXT; +} + +/** + * This function compares node address got in summary with that in NAT. + * On validity, copy that node with cold status, otherwise (invalid node) + * ignore that. 
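+ * The segment is scanned twice (tracked by the 'initial' flag): the
+ * first pass only issues readahead via ra_node_page(), the second
+ * dirties the node pages that are still valid.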
+ */ +static int gc_node_segment(struct f2fs_sb_info *sbi, + struct f2fs_summary *sum, unsigned int segno, int gc_type) +{ + bool initial = true; + struct f2fs_summary *entry; + int off; + +next_step: + entry = sum; + for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { + nid_t nid = le32_to_cpu(entry->nid); + struct page *node_page; + int err; + + /* + * It makes sure that free segments are able to write + * all the dirty node pages before CP after this CP. + * So let's check the space of dirty node pages. + */ + if (should_do_checkpoint(sbi)) { + mutex_lock(&sbi->cp_mutex); + block_operations(sbi); + return GC_BLOCKED; + } + + err = check_valid_map(sbi, segno, off); + if (err == GC_ERROR) + return err; + else if (err == GC_NEXT) + continue; + + if (initial) { + ra_node_page(sbi, nid); + continue; + } + node_page = get_node_page(sbi, nid); + if (IS_ERR(node_page)) + continue; + + /* set page dirty and write it */ + if (!PageWriteback(node_page)) + set_page_dirty(node_page); + f2fs_put_page(node_page, 1); + stat_inc_node_blk_count(sbi, 1); + } + if (initial) { + initial = false; + goto next_step; + } + + if (gc_type == FG_GC) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, + .for_reclaim = 0, + }; + sync_node_pages(sbi, 0, &wbc); + } + return GC_DONE; +} + +/** + * Calculate start block index that this node page contains + */ +block_t start_bidx_of_node(unsigned int node_ofs) +{ + block_t start_bidx; + unsigned int bidx, indirect_blks; + int dec; + + indirect_blks = 2 * NIDS_PER_BLOCK + 4; + + start_bidx = 1; + if (node_ofs == 0) { + start_bidx = 0; + } else if (node_ofs <= 2) { + bidx = node_ofs - 1; + } else if (node_ofs <= indirect_blks) { + dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1); + bidx = node_ofs - 2 - dec; + } else { + dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); + bidx = node_ofs - 5 - dec; + } + + if (start_bidx) + start_bidx = bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE; + return start_bidx; +} + +static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, + struct node_info *dni, block_t blkaddr, unsigned int *nofs) +{ + struct page *node_page; + nid_t nid; + unsigned int ofs_in_node; + block_t source_blkaddr; + + nid = le32_to_cpu(sum->nid); + ofs_in_node = le16_to_cpu(sum->ofs_in_node); + + node_page = get_node_page(sbi, nid); + if (IS_ERR(node_page)) + return GC_NEXT; + + get_node_info(sbi, nid, dni); + + if (sum->version != dni->version) { + f2fs_put_page(node_page, 1); + return GC_NEXT; + } + + *nofs = ofs_of_node(node_page); + source_blkaddr = datablock_addr(node_page, ofs_in_node); + f2fs_put_page(node_page, 1); + + if (source_blkaddr != blkaddr) + return GC_NEXT; + return GC_OK; +} + +static void move_data_page(struct inode *inode, struct page *page, int gc_type) +{ + if (page->mapping != inode->i_mapping) + goto out; + + if (inode != page->mapping->host) + goto out; + + if (PageWriteback(page)) + goto out; + + if (gc_type == BG_GC) { + set_page_dirty(page); + set_cold_data(page); + } else { + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + mutex_lock_op(sbi, DATA_WRITE); + if (clear_page_dirty_for_io(page) && + S_ISDIR(inode->i_mode)) { + dec_page_count(sbi, F2FS_DIRTY_DENTS); + inode_dec_dirty_dents(inode); + } + set_cold_data(page); + do_write_data_page(page); + mutex_unlock_op(sbi, DATA_WRITE); + clear_cold_data(page); + } +out: + f2fs_put_page(page, 1); +} + +/** + * This function tries to get parent node of victim data block, and identifies + * data block validity. 
If the block is valid, copy that with cold status and + * modify parent node. + * If the parent node is not valid or the data block address is different, + * the victim data block is ignored. + */ +static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, + struct list_head *ilist, unsigned int segno, int gc_type) +{ + struct super_block *sb = sbi->sb; + struct f2fs_summary *entry; + block_t start_addr; + int err, off; + int phase = 0; + + start_addr = START_BLOCK(sbi, segno); + +next_step: + entry = sum; + for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { + struct page *data_page; + struct inode *inode; + struct node_info dni; /* dnode info for the data */ + unsigned int ofs_in_node, nofs; + block_t start_bidx; + + /* + * It makes sure that free segments are able to write + * all the dirty node pages before CP after this CP. + * So let's check the space of dirty node pages. + */ + if (should_do_checkpoint(sbi)) { + mutex_lock(&sbi->cp_mutex); + block_operations(sbi); + err = GC_BLOCKED; + goto stop; + } + + err = check_valid_map(sbi, segno, off); + if (err == GC_ERROR) + goto stop; + else if (err == GC_NEXT) + continue; + + if (phase == 0) { + ra_node_page(sbi, le32_to_cpu(entry->nid)); + continue; + } + + /* Get an inode by ino with checking validity */ + err = check_dnode(sbi, entry, &dni, start_addr + off, &nofs); + if (err == GC_ERROR) + goto stop; + else if (err == GC_NEXT) + continue; + + if (phase == 1) { + ra_node_page(sbi, dni.ino); + continue; + } + + start_bidx = start_bidx_of_node(nofs); + ofs_in_node = le16_to_cpu(entry->ofs_in_node); + + if (phase == 2) { + inode = f2fs_iget_nowait(sb, dni.ino); + if (IS_ERR(inode)) + continue; + + data_page = find_data_page(inode, + start_bidx + ofs_in_node); + if (IS_ERR(data_page)) + goto next_iput; + + f2fs_put_page(data_page, 0); + add_gc_inode(inode, ilist); + } else { + inode = find_gc_inode(dni.ino, ilist); + if (inode) { + data_page = get_lock_data_page(inode, + start_bidx + ofs_in_node); + if (IS_ERR(data_page)) + continue; + move_data_page(inode, data_page, gc_type); + stat_inc_data_blk_count(sbi, 1); + } + } + continue; +next_iput: + iput(inode); + } + if (++phase < 4) + goto next_step; + err = GC_DONE; +stop: + if (gc_type == FG_GC) + f2fs_submit_bio(sbi, DATA, true); + return err; +} + +static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, + int gc_type, int type) +{ + struct sit_info *sit_i = SIT_I(sbi); + int ret; + mutex_lock(&sit_i->sentry_lock); + ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type, type, LFS); + mutex_unlock(&sit_i->sentry_lock); + return ret; +} + +static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, + struct list_head *ilist, int gc_type) +{ + struct page *sum_page; + struct f2fs_summary_block *sum; + int ret = GC_DONE; + + /* read segment summary of victim */ + sum_page = get_sum_page(sbi, segno); + if (IS_ERR(sum_page)) + return GC_ERROR; + + /* + * CP needs to lock sum_page. In this time, we don't need + * to lock this page, because this summary page is not gone anywhere. + * Also, this page is not gonna be updated before GC is done. 
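+ * The reference taken by get_sum_page() keeps the summary page
+ * pinned until the f2fs_put_page() at the end of this function.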
+ */ + unlock_page(sum_page); + sum = page_address(sum_page); + + switch (GET_SUM_TYPE((&sum->footer))) { + case SUM_TYPE_NODE: + ret = gc_node_segment(sbi, sum->entries, segno, gc_type); + break; + case SUM_TYPE_DATA: + ret = gc_data_segment(sbi, sum->entries, ilist, segno, gc_type); + break; + } + stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer))); + stat_inc_call_count(sbi->stat_info); + + f2fs_put_page(sum_page, 0); + return ret; +} + +int f2fs_gc(struct f2fs_sb_info *sbi, int nGC) +{ + unsigned int segno; + int old_free_secs, cur_free_secs; + int gc_status, nfree; + struct list_head ilist; + int gc_type = BG_GC; + + INIT_LIST_HEAD(&ilist); +gc_more: + nfree = 0; + gc_status = GC_NONE; + + if (has_not_enough_free_secs(sbi)) + old_free_secs = reserved_sections(sbi); + else + old_free_secs = free_sections(sbi); + + while (sbi->sb->s_flags & MS_ACTIVE) { + int i; + if (has_not_enough_free_secs(sbi)) + gc_type = FG_GC; + + cur_free_secs = free_sections(sbi) + nfree; + + /* We got free space successfully. */ + if (nGC < cur_free_secs - old_free_secs) + break; + + if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) + break; + + for (i = 0; i < sbi->segs_per_sec; i++) { + /* + * do_garbage_collect will give us three gc_status: + * GC_ERROR, GC_DONE, and GC_BLOCKED. + * If GC is finished uncleanly, we have to return + * the victim to dirty segment list. + */ + gc_status = do_garbage_collect(sbi, segno + i, + &ilist, gc_type); + if (gc_status != GC_DONE) + goto stop; + nfree++; + } + } +stop: + if (has_not_enough_free_secs(sbi) || gc_status == GC_BLOCKED) { + write_checkpoint(sbi, (gc_status == GC_BLOCKED), false); + if (nfree) + goto gc_more; + } + mutex_unlock(&sbi->gc_mutex); + + put_gc_inode(&ilist); + BUG_ON(!list_empty(&ilist)); + return gc_status; +} + +void build_gc_manager(struct f2fs_sb_info *sbi) +{ + DIRTY_I(sbi)->v_ops = &default_v_ops; +} + +int create_gc_caches(void) +{ + winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes", + sizeof(struct inode_entry), NULL); + if (!winode_slab) + return -ENOMEM; + return 0; +} + +void destroy_gc_caches(void) +{ + kmem_cache_destroy(winode_slab); +} diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h new file mode 100644 index 000000000000..cf42a554ca0a --- /dev/null +++ b/fs/f2fs/gc.h @@ -0,0 +1,117 @@ +/** + * fs/f2fs/gc.h + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define GC_THREAD_NAME "f2fs_gc_task" +#define GC_THREAD_MIN_WB_PAGES 1 /* + * a threshold to determine + * whether IO subsystem is idle + * or not + */ +#define GC_THREAD_MIN_SLEEP_TIME 10000 /* milliseconds */ +#define GC_THREAD_MAX_SLEEP_TIME 30000 +#define GC_THREAD_NOGC_SLEEP_TIME 10000 +#define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */ +#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ + +/* Search max. 
number of dirty segments to select a victim segment */ +#define MAX_VICTIM_SEARCH 20 + +enum { + GC_NONE = 0, + GC_ERROR, + GC_OK, + GC_NEXT, + GC_BLOCKED, + GC_DONE, +}; + +struct f2fs_gc_kthread { + struct task_struct *f2fs_gc_task; + wait_queue_head_t gc_wait_queue_head; +}; + +struct inode_entry { + struct list_head list; + struct inode *inode; +}; + +/** + * inline functions + */ +static inline block_t free_user_blocks(struct f2fs_sb_info *sbi) +{ + if (free_segments(sbi) < overprovision_segments(sbi)) + return 0; + else + return (free_segments(sbi) - overprovision_segments(sbi)) + << sbi->log_blocks_per_seg; +} + +static inline block_t limit_invalid_user_blocks(struct f2fs_sb_info *sbi) +{ + return (long)(sbi->user_block_count * LIMIT_INVALID_BLOCK) / 100; +} + +static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi) +{ + block_t reclaimable_user_blocks = sbi->user_block_count - + written_block_count(sbi); + return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100; +} + +static inline long increase_sleep_time(long wait) +{ + wait += GC_THREAD_MIN_SLEEP_TIME; + if (wait > GC_THREAD_MAX_SLEEP_TIME) + wait = GC_THREAD_MAX_SLEEP_TIME; + return wait; +} + +static inline long decrease_sleep_time(long wait) +{ + wait -= GC_THREAD_MIN_SLEEP_TIME; + if (wait <= GC_THREAD_MIN_SLEEP_TIME) + wait = GC_THREAD_MIN_SLEEP_TIME; + return wait; +} + +static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) +{ + block_t invalid_user_blocks = sbi->user_block_count - + written_block_count(sbi); + /* + * Background GC is triggered with the following condition. + * 1. There are a number of invalid blocks. + * 2. There is not enough free space. + */ + if (invalid_user_blocks > limit_invalid_user_blocks(sbi) && + free_user_blocks(sbi) < limit_free_user_blocks(sbi)) + return true; + return false; +} + +static inline int is_idle(struct f2fs_sb_info *sbi) +{ + struct block_device *bdev = sbi->sb->s_bdev; + struct request_queue *q = bdev_get_queue(bdev); + struct request_list *rl = &q->root_rl; + return !(rl->count[BLK_RW_SYNC]) && !(rl->count[BLK_RW_ASYNC]); +} + +static inline bool should_do_checkpoint(struct f2fs_sb_info *sbi) +{ + unsigned int pages_per_sec = sbi->segs_per_sec * + (1 << sbi->log_blocks_per_seg); + int node_secs = ((get_pages(sbi, F2FS_DIRTY_NODES) + pages_per_sec - 1) + >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; + int dent_secs = ((get_pages(sbi, F2FS_DIRTY_DENTS) + pages_per_sec - 1) + >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; + return free_sections(sbi) <= (node_secs + 2 * dent_secs + 2); +} -- cgit v1.2.1 From d624c96fb3249e5d3dcf4e60a805e5e6b0dd7d91 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 2 Nov 2012 17:13:32 +0900 Subject: f2fs: add recovery routines for roll-forward This adds roll-forward routines to recover fsynced data. - F2FS uses basically roll-back model with checkpointing. - In order to implement fsync(), there are two approaches as follows. 1. A roll-back model with checkpointing at every fsync() : This is a naive method, but suffers from very low performance. 2. A roll-forward model : F2FS adopts this model where all the fsynced data should be recovered, which were written after checkpointing was done. In order to figure out the data, F2FS keeps a "fsync" mark in direct node blocks. In addition, F2FS remains the location of next node block in each direct node block for reconstructing the chain of node blocks during the recovery. 
- In order to enhance the performance, F2FS keeps a "dentry" mark also in direct node blocks. If this is set during the recovery, F2FS replays adding a dentry. Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 375 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 375 insertions(+) create mode 100644 fs/f2fs/recovery.c (limited to 'fs') diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c new file mode 100644 index 000000000000..7a43df0b72c1 --- /dev/null +++ b/fs/f2fs/recovery.c @@ -0,0 +1,375 @@ +/** + * fs/f2fs/recovery.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include "f2fs.h" +#include "node.h" +#include "segment.h" + +static struct kmem_cache *fsync_entry_slab; + +bool space_for_roll_forward(struct f2fs_sb_info *sbi) +{ + if (sbi->last_valid_block_count + sbi->alloc_valid_block_count + > sbi->user_block_count) + return false; + return true; +} + +static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, + nid_t ino) +{ + struct list_head *this; + struct fsync_inode_entry *entry; + + list_for_each(this, head) { + entry = list_entry(this, struct fsync_inode_entry, list); + if (entry->inode->i_ino == ino) + return entry; + } + return NULL; +} + +static int recover_dentry(struct page *ipage, struct inode *inode) +{ + struct f2fs_node *raw_node = (struct f2fs_node *)kmap(ipage); + struct f2fs_inode *raw_inode = &(raw_node->i); + struct dentry dent, parent; + struct f2fs_dir_entry *de; + struct page *page; + struct inode *dir; + int err = 0; + + if (!is_dent_dnode(ipage)) + goto out; + + dir = f2fs_iget(inode->i_sb, le32_to_cpu(raw_inode->i_pino)); + if (IS_ERR(dir)) { + err = -EINVAL; + goto out; + } + + parent.d_inode = dir; + dent.d_parent = &parent; + dent.d_name.len = le32_to_cpu(raw_inode->i_namelen); + dent.d_name.name = raw_inode->i_name; + + de = f2fs_find_entry(dir, &dent.d_name, &page); + if (de) { + kunmap(page); + f2fs_put_page(page, 0); + } else { + f2fs_add_link(&dent, inode); + } + iput(dir); +out: + kunmap(ipage); + return err; +} + +static int recover_inode(struct inode *inode, struct page *node_page) +{ + void *kaddr = page_address(node_page); + struct f2fs_node *raw_node = (struct f2fs_node *)kaddr; + struct f2fs_inode *raw_inode = &(raw_node->i); + + inode->i_mode = le32_to_cpu(raw_inode->i_mode); + i_size_write(inode, le64_to_cpu(raw_inode->i_size)); + inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); + inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); + inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime); + inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); + inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); + inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); + + return recover_dentry(node_page, inode); +} + +static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) +{ + unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver); + struct curseg_info *curseg; + struct page *page; + block_t blkaddr; + int err = 0; + + /* get node pages in the current segment */ + curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); + blkaddr = START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff; + + /* read node page */ + page = alloc_page(GFP_F2FS_ZERO); + if (IS_ERR(page)) + return PTR_ERR(page); + 
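+	/*
+	 * Walk the chain of node blocks written after the last checkpoint:
+	 * each direct node block records the address of the next one, so
+	 * keep reading until a block's checkpoint version no longer
+	 * matches the running one.
+	 */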
lock_page(page); + + while (1) { + struct fsync_inode_entry *entry; + + if (f2fs_readpage(sbi, page, blkaddr, READ_SYNC)) + goto out; + + if (cp_ver != cpver_of_node(page)) + goto out; + + if (!is_fsync_dnode(page)) + goto next; + + entry = get_fsync_inode(head, ino_of_node(page)); + if (entry) { + entry->blkaddr = blkaddr; + if (IS_INODE(page) && is_dent_dnode(page)) + set_inode_flag(F2FS_I(entry->inode), + FI_INC_LINK); + } else { + if (IS_INODE(page) && is_dent_dnode(page)) { + if (recover_inode_page(sbi, page)) { + err = -ENOMEM; + goto out; + } + } + + /* add this fsync inode to the list */ + entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS); + if (!entry) { + err = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&entry->list); + list_add_tail(&entry->list, head); + + entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); + if (IS_ERR(entry->inode)) { + err = PTR_ERR(entry->inode); + goto out; + } + entry->blkaddr = blkaddr; + } + if (IS_INODE(page)) { + err = recover_inode(entry->inode, page); + if (err) + goto out; + } +next: + /* check next segment */ + blkaddr = next_blkaddr_of_node(page); + ClearPageUptodate(page); + } +out: + unlock_page(page); + __free_pages(page, 0); + return err; +} + +static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi, + struct list_head *head) +{ + struct list_head *this; + struct fsync_inode_entry *entry; + list_for_each(this, head) { + entry = list_entry(this, struct fsync_inode_entry, list); + iput(entry->inode); + list_del(&entry->list); + kmem_cache_free(fsync_entry_slab, entry); + } +} + +static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi, + block_t blkaddr) +{ + struct seg_entry *sentry; + unsigned int segno = GET_SEGNO(sbi, blkaddr); + unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & + (sbi->blocks_per_seg - 1); + struct f2fs_summary sum; + nid_t ino; + void *kaddr; + struct inode *inode; + struct page *node_page; + block_t bidx; + int i; + + sentry = get_seg_entry(sbi, segno); + if (!f2fs_test_bit(blkoff, sentry->cur_valid_map)) + return; + + /* Get the previous summary */ + for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) { + struct curseg_info *curseg = CURSEG_I(sbi, i); + if (curseg->segno == segno) { + sum = curseg->sum_blk->entries[blkoff]; + break; + } + } + if (i > CURSEG_COLD_DATA) { + struct page *sum_page = get_sum_page(sbi, segno); + struct f2fs_summary_block *sum_node; + kaddr = page_address(sum_page); + sum_node = (struct f2fs_summary_block *)kaddr; + sum = sum_node->entries[blkoff]; + f2fs_put_page(sum_page, 1); + } + + /* Get the node page */ + node_page = get_node_page(sbi, le32_to_cpu(sum.nid)); + bidx = start_bidx_of_node(ofs_of_node(node_page)) + + le16_to_cpu(sum.ofs_in_node); + ino = ino_of_node(node_page); + f2fs_put_page(node_page, 1); + + /* Deallocate previous index in the node page */ + inode = f2fs_iget_nowait(sbi->sb, ino); + truncate_hole(inode, bidx, bidx + 1); + iput(inode); +} + +static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, + struct page *page, block_t blkaddr) +{ + unsigned int start, end; + struct dnode_of_data dn; + struct f2fs_summary sum; + struct node_info ni; + + start = start_bidx_of_node(ofs_of_node(page)); + if (IS_INODE(page)) + end = start + ADDRS_PER_INODE; + else + end = start + ADDRS_PER_BLOCK; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + if (get_dnode_of_data(&dn, start, 0)) + return; + + wait_on_page_writeback(dn.node_page); + + get_node_info(sbi, dn.nid, &ni); + BUG_ON(ni.ino != ino_of_node(page)); + BUG_ON(ofs_of_node(dn.node_page) 
!= ofs_of_node(page)); + + for (; start < end; start++) { + block_t src, dest; + + src = datablock_addr(dn.node_page, dn.ofs_in_node); + dest = datablock_addr(page, dn.ofs_in_node); + + if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR) { + if (src == NULL_ADDR) { + int err = reserve_new_block(&dn); + /* We should not get -ENOSPC */ + BUG_ON(err); + } + + /* Check the previous node page having this index */ + check_index_in_prev_nodes(sbi, dest); + + set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); + + /* write dummy data page */ + recover_data_page(sbi, NULL, &sum, src, dest); + update_extent_cache(dest, &dn); + } + dn.ofs_in_node++; + } + + /* write node page in place */ + set_summary(&sum, dn.nid, 0, 0); + if (IS_INODE(dn.node_page)) + sync_inode_page(&dn); + + copy_node_footer(dn.node_page, page); + fill_node_footer(dn.node_page, dn.nid, ni.ino, + ofs_of_node(page), false); + set_page_dirty(dn.node_page); + + recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr); + f2fs_put_dnode(&dn); +} + +static void recover_data(struct f2fs_sb_info *sbi, + struct list_head *head, int type) +{ + unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver); + struct curseg_info *curseg; + struct page *page; + block_t blkaddr; + + /* get node pages in the current segment */ + curseg = CURSEG_I(sbi, type); + blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + + /* read node page */ + page = alloc_page(GFP_NOFS | __GFP_ZERO); + if (IS_ERR(page)) + return; + lock_page(page); + + while (1) { + struct fsync_inode_entry *entry; + + if (f2fs_readpage(sbi, page, blkaddr, READ_SYNC)) + goto out; + + if (cp_ver != cpver_of_node(page)) + goto out; + + entry = get_fsync_inode(head, ino_of_node(page)); + if (!entry) + goto next; + + do_recover_data(sbi, entry->inode, page, blkaddr); + + if (entry->blkaddr == blkaddr) { + iput(entry->inode); + list_del(&entry->list); + kmem_cache_free(fsync_entry_slab, entry); + } +next: + /* check next segment */ + blkaddr = next_blkaddr_of_node(page); + ClearPageUptodate(page); + } +out: + unlock_page(page); + __free_pages(page, 0); + + allocate_new_segments(sbi); +} + +void recover_fsync_data(struct f2fs_sb_info *sbi) +{ + struct list_head inode_list; + + fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", + sizeof(struct fsync_inode_entry), NULL); + if (unlikely(!fsync_entry_slab)) + return; + + INIT_LIST_HEAD(&inode_list); + + /* step #1: find fsynced inode numbers */ + if (find_fsync_dnodes(sbi, &inode_list)) + goto out; + + if (list_empty(&inode_list)) + goto out; + + /* step #2: recover data */ + sbi->por_doing = 1; + recover_data(sbi, &inode_list, CURSEG_WARM_NODE); + sbi->por_doing = 0; + BUG_ON(!list_empty(&inode_list)); +out: + destroy_fsync_dnodes(sbi, &inode_list); + kmem_cache_destroy(fsync_entry_slab); + write_checkpoint(sbi, false, false); +} -- cgit v1.2.1 From 902829aa0b722511369e4e6193e66390bc58e0a2 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 3 Nov 2012 06:50:41 +0900 Subject: f2fs: move proc files to debugfs This moves all of the f2fs debugging files into debugfs. The files are located in /sys/kernel/debug/f2fs/ Note, I think we are generating all of the same information in each of the files for every unique f2fs filesystem in the machine. This copies the functionality that was present in the proc files, but this should be fixed up in the future. 
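The registration itself is the stock debugfs plus seq_file pattern. A minimal, self-contained sketch of the same wiring follows; the module boilerplate and the demo_* names are illustrative, not taken from the patch:

#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>

/* Illustrative sketch of the debugfs wiring this patch uses. */
static struct dentry *demo_root;

static int demo_show(struct seq_file *s, void *v)
{
	/* a single "status" entry prints everything in one go */
	seq_printf(s, "f2fs statistics would be printed here\n");
	return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
	return single_open(file, demo_show, inode->i_private);
}

static const struct file_operations demo_fops = {
	.open		= demo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static int __init demo_init(void)
{
	/* appears as /sys/kernel/debug/f2fs/status */
	demo_root = debugfs_create_dir("f2fs", NULL);
	debugfs_create_file("status", S_IRUGO, demo_root, NULL, &demo_fops);
	return 0;
}

static void __exit demo_exit(void)
{
	debugfs_remove_recursive(demo_root);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");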
Signed-off-by: Greg Kroah-Hartman [jaegeuk.kim@samsung.com: merged 3 debugfs entries into a *status* entry] Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 361 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 361 insertions(+) create mode 100644 fs/f2fs/debug.c (limited to 'fs') diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c new file mode 100644 index 000000000000..a56181c1b28b --- /dev/null +++ b/fs/f2fs/debug.c @@ -0,0 +1,361 @@ +/** + * f2fs debugging statistics + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * Copyright (c) 2012 Linux Foundation + * Copyright (c) 2012 Greg Kroah-Hartman + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include "gc.h" + +static LIST_HEAD(f2fs_stat_list); +static struct dentry *debugfs_root; + +void update_general_status(struct f2fs_sb_info *sbi) +{ + struct f2fs_stat_info *si = sbi->stat_info; + int i; + + /* valid check of the segment numbers */ + si->hit_ext = sbi->read_hit_ext; + si->total_ext = sbi->total_hit_ext; + si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); + si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); + si->ndirty_dirs = sbi->n_dirty_dirs; + si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META); + si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; + si->rsvd_segs = reserved_segments(sbi); + si->overp_segs = overprovision_segments(sbi); + si->valid_count = valid_user_blocks(sbi); + si->valid_node_count = valid_node_count(sbi); + si->valid_inode_count = valid_inode_count(sbi); + si->utilization = utilization(sbi); + + si->free_segs = free_segments(sbi); + si->free_secs = free_sections(sbi); + si->prefree_count = prefree_segments(sbi); + si->dirty_count = dirty_segments(sbi); + si->node_pages = sbi->node_inode->i_mapping->nrpages; + si->meta_pages = sbi->meta_inode->i_mapping->nrpages; + si->nats = NM_I(sbi)->nat_cnt; + si->sits = SIT_I(sbi)->dirty_sentries; + si->fnids = NM_I(sbi)->fcnt; + si->bg_gc = sbi->bg_gc; + si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) + * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) + / 2; + si->util_valid = (int)(written_block_count(sbi) >> + sbi->log_blocks_per_seg) + * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) + / 2; + si->util_invalid = 50 - si->util_free - si->util_valid; + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) { + struct curseg_info *curseg = CURSEG_I(sbi, i); + si->curseg[i] = curseg->segno; + si->cursec[i] = curseg->segno / sbi->segs_per_sec; + si->curzone[i] = si->cursec[i] / sbi->secs_per_zone; + } + + for (i = 0; i < 2; i++) { + si->segment_count[i] = sbi->segment_count[i]; + si->block_count[i] = sbi->block_count[i]; + } +} + +/** + * This function calculates BDF of every segments + */ +static void update_sit_info(struct f2fs_sb_info *sbi) +{ + struct f2fs_stat_info *si = sbi->stat_info; + unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist; + struct sit_info *sit_i = SIT_I(sbi); + unsigned int segno, vblocks; + int ndirty = 0; + + bimodal = 0; + total_vblocks = 0; + blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg); + hblks_per_sec = blks_per_sec / 2; + mutex_lock(&sit_i->sentry_lock); + for (segno = 0; segno < TOTAL_SEGS(sbi); 
segno += sbi->segs_per_sec) { + vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); + dist = abs(vblocks - hblks_per_sec); + bimodal += dist * dist; + + if (vblocks > 0 && vblocks < blks_per_sec) { + total_vblocks += vblocks; + ndirty++; + } + } + mutex_unlock(&sit_i->sentry_lock); + dist = sbi->total_sections * hblks_per_sec * hblks_per_sec / 100; + si->bimodal = bimodal / dist; + if (si->dirty_count) + si->avg_vblocks = total_vblocks / ndirty; + else + si->avg_vblocks = 0; +} + +/** + * This function calculates memory footprint. + */ +static void update_mem_info(struct f2fs_sb_info *sbi) +{ + struct f2fs_stat_info *si = sbi->stat_info; + unsigned npages; + + if (si->base_mem) + goto get_cache; + + si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; + si->base_mem += 2 * sizeof(struct f2fs_inode_info); + si->base_mem += sizeof(*sbi->ckpt); + + /* build sm */ + si->base_mem += sizeof(struct f2fs_sm_info); + + /* build sit */ + si->base_mem += sizeof(struct sit_info); + si->base_mem += TOTAL_SEGS(sbi) * sizeof(struct seg_entry); + si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi)); + si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * TOTAL_SEGS(sbi); + if (sbi->segs_per_sec > 1) + si->base_mem += sbi->total_sections * + sizeof(struct sec_entry); + si->base_mem += __bitmap_size(sbi, SIT_BITMAP); + + /* build free segmap */ + si->base_mem += sizeof(struct free_segmap_info); + si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi)); + si->base_mem += f2fs_bitmap_size(sbi->total_sections); + + /* build curseg */ + si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE; + si->base_mem += PAGE_CACHE_SIZE * NR_CURSEG_TYPE; + + /* build dirty segmap */ + si->base_mem += sizeof(struct dirty_seglist_info); + si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(TOTAL_SEGS(sbi)); + si->base_mem += 2 * f2fs_bitmap_size(TOTAL_SEGS(sbi)); + + /* buld nm */ + si->base_mem += sizeof(struct f2fs_nm_info); + si->base_mem += __bitmap_size(sbi, NAT_BITMAP); + + /* build gc */ + si->base_mem += sizeof(struct f2fs_gc_kthread); + +get_cache: + /* free nids */ + si->cache_mem = NM_I(sbi)->fcnt; + si->cache_mem += NM_I(sbi)->nat_cnt; + npages = sbi->node_inode->i_mapping->nrpages; + si->cache_mem += npages << PAGE_CACHE_SHIFT; + npages = sbi->meta_inode->i_mapping->nrpages; + si->cache_mem += npages << PAGE_CACHE_SHIFT; + si->cache_mem += sbi->n_orphans * sizeof(struct orphan_inode_entry); + si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry); +} + +static int stat_show(struct seq_file *s, void *v) +{ + struct f2fs_stat_info *si, *next; + int i = 0; + int j; + + list_for_each_entry_safe(si, next, &f2fs_stat_list, stat_list) { + + mutex_lock(&si->stat_lock); + if (!si->sbi) { + mutex_unlock(&si->stat_lock); + continue; + } + update_general_status(si->sbi); + + seq_printf(s, "\n=====[ partition info. 
#%d ]=====\n", i++); + seq_printf(s, "[SB: 1] [CP: 2] [NAT: %d] [SIT: %d] ", + si->nat_area_segs, si->sit_area_segs); + seq_printf(s, "[SSA: %d] [MAIN: %d", + si->ssa_area_segs, si->main_area_segs); + seq_printf(s, "(OverProv:%d Resv:%d)]\n\n", + si->overp_segs, si->rsvd_segs); + seq_printf(s, "Utilization: %d%% (%d valid blocks)\n", + si->utilization, si->valid_count); + seq_printf(s, " - Node: %u (Inode: %u, ", + si->valid_node_count, si->valid_inode_count); + seq_printf(s, "Other: %u)\n - Data: %u\n", + si->valid_node_count - si->valid_inode_count, + si->valid_count - si->valid_node_count); + seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", + si->main_area_segs, si->main_area_sections, + si->main_area_zones); + seq_printf(s, " - COLD data: %d, %d, %d\n", + si->curseg[CURSEG_COLD_DATA], + si->cursec[CURSEG_COLD_DATA], + si->curzone[CURSEG_COLD_DATA]); + seq_printf(s, " - WARM data: %d, %d, %d\n", + si->curseg[CURSEG_WARM_DATA], + si->cursec[CURSEG_WARM_DATA], + si->curzone[CURSEG_WARM_DATA]); + seq_printf(s, " - HOT data: %d, %d, %d\n", + si->curseg[CURSEG_HOT_DATA], + si->cursec[CURSEG_HOT_DATA], + si->curzone[CURSEG_HOT_DATA]); + seq_printf(s, " - Dir dnode: %d, %d, %d\n", + si->curseg[CURSEG_HOT_NODE], + si->cursec[CURSEG_HOT_NODE], + si->curzone[CURSEG_HOT_NODE]); + seq_printf(s, " - File dnode: %d, %d, %d\n", + si->curseg[CURSEG_WARM_NODE], + si->cursec[CURSEG_WARM_NODE], + si->curzone[CURSEG_WARM_NODE]); + seq_printf(s, " - Indir nodes: %d, %d, %d\n", + si->curseg[CURSEG_COLD_NODE], + si->cursec[CURSEG_COLD_NODE], + si->curzone[CURSEG_COLD_NODE]); + seq_printf(s, "\n - Valid: %d\n - Dirty: %d\n", + si->main_area_segs - si->dirty_count - + si->prefree_count - si->free_segs, + si->dirty_count); + seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n", + si->prefree_count, si->free_segs, si->free_secs); + seq_printf(s, "GC calls: %d (BG: %d)\n", + si->call_count, si->bg_gc); + seq_printf(s, " - data segments : %d\n", si->data_segs); + seq_printf(s, " - node segments : %d\n", si->node_segs); + seq_printf(s, "Try to move %d blocks\n", si->tot_blks); + seq_printf(s, " - data blocks : %d\n", si->data_blks); + seq_printf(s, " - node blocks : %d\n", si->node_blks); + seq_printf(s, "\nExtent Hit Ratio: %d / %d\n", + si->hit_ext, si->total_ext); + seq_printf(s, "\nBalancing F2FS Async:\n"); + seq_printf(s, " - nodes %4d in %4d\n", + si->ndirty_node, si->node_pages); + seq_printf(s, " - dents %4d in dirs:%4d\n", + si->ndirty_dent, si->ndirty_dirs); + seq_printf(s, " - meta %4d in %4d\n", + si->ndirty_meta, si->meta_pages); + seq_printf(s, " - NATs %5d > %lu\n", + si->nats, NM_WOUT_THRESHOLD); + seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n", + si->sits, si->fnids); + seq_printf(s, "\nDistribution of User Blocks:"); + seq_printf(s, " [ valid | invalid | free ]\n"); + seq_printf(s, " ["); + + for (j = 0; j < si->util_valid; j++) + seq_printf(s, "-"); + seq_printf(s, "|"); + + for (j = 0; j < si->util_invalid; j++) + seq_printf(s, "-"); + seq_printf(s, "|"); + + for (j = 0; j < si->util_free; j++) + seq_printf(s, "-"); + seq_printf(s, "]\n\n"); + seq_printf(s, "SSR: %u blocks in %u segments\n", + si->block_count[SSR], si->segment_count[SSR]); + seq_printf(s, "LFS: %u blocks in %u segments\n", + si->block_count[LFS], si->segment_count[LFS]); + + /* segment usage info */ + update_sit_info(si->sbi); + seq_printf(s, "\nBDF: %u, avg. 
vblocks: %u\n", + si->bimodal, si->avg_vblocks); + + /* memory footprint */ + update_mem_info(si->sbi); + seq_printf(s, "\nMemory: %u KB = static: %u + cached: %u\n", + (si->base_mem + si->cache_mem) >> 10, + si->base_mem >> 10, si->cache_mem >> 10); + mutex_unlock(&si->stat_lock); + } + return 0; +} + +static int stat_open(struct inode *inode, struct file *file) +{ + return single_open(file, stat_show, inode->i_private); +} + +static const struct file_operations stat_fops = { + .open = stat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int init_stats(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + struct f2fs_stat_info *si; + + sbi->stat_info = kzalloc(sizeof(struct f2fs_stat_info), GFP_KERNEL); + if (!sbi->stat_info) + return -ENOMEM; + + si = sbi->stat_info; + mutex_init(&si->stat_lock); + list_add_tail(&si->stat_list, &f2fs_stat_list); + + si->all_area_segs = le32_to_cpu(raw_super->segment_count); + si->sit_area_segs = le32_to_cpu(raw_super->segment_count_sit); + si->nat_area_segs = le32_to_cpu(raw_super->segment_count_nat); + si->ssa_area_segs = le32_to_cpu(raw_super->segment_count_ssa); + si->main_area_segs = le32_to_cpu(raw_super->segment_count_main); + si->main_area_sections = le32_to_cpu(raw_super->section_count); + si->main_area_zones = si->main_area_sections / + le32_to_cpu(raw_super->secs_per_zone); + si->sbi = sbi; + return 0; +} + +int f2fs_build_stats(struct f2fs_sb_info *sbi) +{ + int retval; + + retval = init_stats(sbi); + if (retval) + return retval; + + if (!debugfs_root) + debugfs_root = debugfs_create_dir("f2fs", NULL); + + debugfs_create_file("status", S_IRUGO, debugfs_root, NULL, &stat_fops); + return 0; +} + +void f2fs_destroy_stats(struct f2fs_sb_info *sbi) +{ + struct f2fs_stat_info *si = sbi->stat_info; + + list_del(&si->stat_list); + mutex_lock(&si->stat_lock); + si->sbi = NULL; + mutex_unlock(&si->stat_lock); + kfree(sbi->stat_info); +} + +void destroy_root_stats(void) +{ + debugfs_remove_recursive(debugfs_root); + debugfs_root = NULL; +} -- cgit v1.2.1 From a14d53937cc850d5631e0f809986751770ef65ac Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 2 Nov 2012 17:25:27 +0900 Subject: f2fs: update Kconfig and Makefile This adds Makefile and Kconfig for f2fs, and updates Makefile and Kconfig files in the fs directory. 
Signed-off-by: Jaegeuk Kim --- fs/Kconfig | 1 + fs/Makefile | 1 + fs/f2fs/Kconfig | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/f2fs/Makefile | 7 +++++++ 4 files changed, 61 insertions(+) create mode 100644 fs/f2fs/Kconfig create mode 100644 fs/f2fs/Makefile (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index f95ae3a027f3..e352b3785eb2 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -220,6 +220,7 @@ source "fs/pstore/Kconfig" source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" source "fs/exofs/Kconfig" +source "fs/f2fs/Kconfig" endif # MISC_FILESYSTEMS diff --git a/fs/Makefile b/fs/Makefile index 1d7af79288a0..9d53192236fc 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -123,6 +123,7 @@ obj-$(CONFIG_DEBUG_FS) += debugfs/ obj-$(CONFIG_OCFS2_FS) += ocfs2/ obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/ +obj-$(CONFIG_F2FS_FS) += f2fs/ obj-y += exofs/ # Multiple modules obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig new file mode 100644 index 000000000000..37a6b8e9438f --- /dev/null +++ b/fs/f2fs/Kconfig @@ -0,0 +1,52 @@ +config F2FS_FS + tristate "F2FS filesystem support (EXPERIMENTAL)" + help + F2FS is based on Log-structured File System (LFS), which supports + versatile "flash-friendly" features. The design has been focused on + addressing the fundamental issues in LFS, which are snowball effect + of wandering tree and high cleaning overhead. + + Since flash-based storages show different characteristics according to + the internal geometry or flash memory management schemes aka FTL, F2FS + and tools support various parameters not only for configuring on-disk + layout, but also for selecting allocation and cleaning algorithms. + + If unsure, say N. + +config F2FS_STAT_FS + bool "F2FS Status Information" + depends on F2FS_FS && DEBUG_FS + default y + help + /sys/kernel/debug/f2fs/ contains information about all the partitions + mounted as f2fs. Each file shows the whole f2fs information. + + /sys/kernel/debug/f2fs/status includes: + - major file system information managed by f2fs currently + - average SIT information about whole segments + - current memory footprint consumed by f2fs. + +config F2FS_FS_XATTR + bool "F2FS extended attributes" + depends on F2FS_FS + default y + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit for details). + + If unsure, say N. + +config F2FS_FS_POSIX_ACL + bool "F2FS Access Control Lists" + depends on F2FS_FS_XATTR + select FS_POSIX_ACL + default y + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile new file mode 100644 index 000000000000..27a0820340b9 --- /dev/null +++ b/fs/f2fs/Makefile @@ -0,0 +1,7 @@ +obj-$(CONFIG_F2FS_FS) += f2fs.o + +f2fs-y := dir.o file.o inode.o namei.o hash.o super.o +f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o +f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o +f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o +f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o -- cgit v1.2.1 From cf0e3a64cad19acd5904946d0647d751c1671620 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Tue, 27 Nov 2012 16:02:16 +0530 Subject: f2fs: remove unneeded version.h header file from f2fs.h Including <linux/version.h> is not necessary. 
Signed-off-by: Sachin Kamat --- fs/f2fs/f2fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7aa70b54172d..d3f5a70e2a49 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.1 From 25ca923b2a766b9c93b63777ead351137533a623 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 28 Nov 2012 16:12:41 +0900 Subject: f2fs: fix endian conversion bugs reported by sparse This patch should resolve the bugs reported by the sparse tool. Initial reports were written by "kbuild test robot" managed by fengguang.wu. On my local machines, I've also tested by running: > make C=2 CF="-D__CHECK_ENDIAN__" Accordingly, I've found lots of warnings and bugs related to the endian conversion, and I've fixed them all at this point. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 32 +++++++++++++++++--------------- fs/f2fs/data.c | 2 +- fs/f2fs/debug.c | 2 +- fs/f2fs/dir.c | 8 ++++---- fs/f2fs/f2fs.h | 25 +++++++++++++++++++++++-- fs/f2fs/hash.c | 3 +-- fs/f2fs/node.c | 4 ++-- fs/f2fs/node.h | 2 +- fs/f2fs/recovery.c | 2 +- fs/f2fs/segment.c | 14 +++++++------- fs/f2fs/super.c | 8 ++++---- 11 files changed, 62 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index ab743f92ee06..7c18f8efaadc 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -268,7 +268,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) { block_t start_blk, orphan_blkaddr, i, j; - if (!(F2FS_CKPT(sbi)->ckpt_flags & CP_ORPHAN_PRESENT_FLAG)) + if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG)) return 0; sbi->por_doing = 1; @@ -287,7 +287,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) f2fs_put_page(page, 1); } /* clear Orphan Flag */ - F2FS_CKPT(sbi)->ckpt_flags &= (~CP_ORPHAN_PRESENT_FLAG); + clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG); sbi->por_doing = 0; return 0; } @@ -376,7 +376,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, pre_version = le64_to_cpu(cp_block->checkpoint_ver); /* Read the 2nd cp block in this CP pack */ - cp_addr += le64_to_cpu(cp_block->cp_pack_total_block_count) - 1; + cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1; cp_page_2 = get_meta_page(sbi, cp_addr); cp_block = (struct f2fs_checkpoint *)page_address(cp_page_2); @@ -605,8 +605,8 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) block_t start_blk; struct page *cp_page; unsigned int data_sum_blocks, orphan_blocks; + unsigned int crc32 = 0; void *kaddr; - __u32 crc32 = 0; int i; /* Flush all the NAT/SIT pages */ @@ -646,33 +646,35 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) /* 2 cp + n data seg summary + orphan inode blocks */ data_sum_blocks = npages_for_summary_flush(sbi); if (data_sum_blocks < 3) - ckpt->ckpt_flags |= CP_COMPACT_SUM_FLAG; + set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); else - ckpt->ckpt_flags &= (~CP_COMPACT_SUM_FLAG); + clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); orphan_blocks = (sbi->n_orphans + F2FS_ORPHANS_PER_BLOCK - 1) / F2FS_ORPHANS_PER_BLOCK; - ckpt->cp_pack_start_sum = 1 + orphan_blocks; - ckpt->cp_pack_total_block_count = 2 + data_sum_blocks + orphan_blocks; + ckpt->cp_pack_start_sum = cpu_to_le32(1 + orphan_blocks); if (is_umount) { - ckpt->ckpt_flags |= CP_UMOUNT_FLAG; - ckpt->cp_pack_total_block_count += NR_CURSEG_NODE_TYPE; + set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + ckpt->cp_pack_total_block_count = 
cpu_to_le32(2 + + data_sum_blocks + orphan_blocks + NR_CURSEG_NODE_TYPE); } else { - ckpt->ckpt_flags &= (~CP_UMOUNT_FLAG); + clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + ckpt->cp_pack_total_block_count = cpu_to_le32(2 + + data_sum_blocks + orphan_blocks); } if (sbi->n_orphans) - ckpt->ckpt_flags |= CP_ORPHAN_PRESENT_FLAG; + set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); else - ckpt->ckpt_flags &= (~CP_ORPHAN_PRESENT_FLAG); + clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); /* update SIT/NAT bitmap */ get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP)); get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP)); crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset)); - *(__u32 *)((unsigned char *)ckpt + + *(__le32 *)((unsigned char *)ckpt + le32_to_cpu(ckpt->checksum_offset)) = cpu_to_le32(crc32); @@ -716,7 +718,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) sbi->alloc_valid_block_count = 0; /* Here, we only have one bio having CP pack */ - if (sbi->ckpt->ckpt_flags & CP_ERROR_FLAG) + if (is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) sbi->sb->s_flags |= MS_RDONLY; else sync_meta_pages(sbi, META_FLUSH, LONG_MAX); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index c2fd0a80db16..5635cc5a9d4d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -545,7 +545,7 @@ redirty_out: #define MAX_DESIRED_PAGES_WP 4096 -int f2fs_write_data_pages(struct address_space *mapping, +static int f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index a56181c1b28b..fb62960a1dc1 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -27,7 +27,7 @@ static LIST_HEAD(f2fs_stat_list); static struct dentry *debugfs_root; -void update_general_status(struct f2fs_sb_info *sbi) +static void update_general_status(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = sbi->stat_info; int i; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 5975568d03df..5ec7a06120e1 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -80,7 +80,7 @@ static bool early_match_name(const char *name, int namelen, if (le16_to_cpu(de->name_len) != namelen) return false; - if (le32_to_cpu(de->hash_code) != namehash) + if (de->hash_code != namehash) return false; return true; @@ -143,7 +143,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, nbucket = dir_buckets(level); nblock = bucket_blocks(level); - bidx = dir_block_index(level, namehash % nbucket); + bidx = dir_block_index(level, le32_to_cpu(namehash) % nbucket); end_block = bidx + nblock; for (; bidx < end_block; bidx++) { @@ -406,7 +406,7 @@ start: nbucket = dir_buckets(level); nblock = bucket_blocks(level); - bidx = dir_block_index(level, (dentry_hash % nbucket)); + bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket)); for (block = bidx; block <= (bidx + nblock - 1); block++) { mutex_lock_op(sbi, DENTRY_OPS); @@ -437,7 +437,7 @@ add_dentry: wait_on_page_writeback(dentry_page); de = &dentry_blk->dentry[bit_pos]; - de->hash_code = cpu_to_le32(dentry_hash); + de->hash_code = dentry_hash; de->name_len = cpu_to_le16(namelen); memcpy(dentry_blk->filename[bit_pos], name, namelen); de->ino = cpu_to_le32(inode->i_ino); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d3f5a70e2a49..8d7fde1bda1e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -463,6 +463,26 @@ static inline void F2FS_RESET_SB_DIRT(struct f2fs_sb_info *sbi) sbi->s_dirty = 0; } +static inline bool is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ + unsigned int 
ckpt_flags = le32_to_cpu(cp->ckpt_flags); + return ckpt_flags & f; +} + +static inline void set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ + unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + ckpt_flags |= f; + cp->ckpt_flags = cpu_to_le32(ckpt_flags); +} + +static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ + unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + ckpt_flags &= (~f); + cp->ckpt_flags = cpu_to_le32(ckpt_flags); +} + static inline void mutex_lock_op(struct f2fs_sb_info *sbi, enum lock_type t) { mutex_lock_nested(&sbi->fs_lock[t], t); @@ -577,7 +597,8 @@ static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag) static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - int offset = (flag == NAT_BITMAP) ? ckpt->sit_ver_bitmap_bytesize : 0; + int offset = (flag == NAT_BITMAP) ? + le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0; return &ckpt->sit_nat_version_bitmap + offset; } @@ -587,7 +608,7 @@ static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long long ckpt_version = le64_to_cpu(ckpt->checkpoint_ver); - start_addr = le64_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); + start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); /* * odd numbered checkpoint should at cp segment 0 diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index 098a1963d7c7..beb155e8d06d 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -92,7 +92,6 @@ f2fs_hash_t f2fs_dentry_hash(const char *name, int len) hash = buf[0]; minor_hash = buf[1]; - f2fs_hash = hash; - f2fs_hash &= ~F2FS_HASH_COL_BIT; + f2fs_hash = cpu_to_le32(hash & ~F2FS_HASH_COL_BIT); return f2fs_hash; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 216f04dc1177..5d421fe22575 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1445,8 +1445,8 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) memcpy(dst, src, (unsigned long)&src->i.i_ext - (unsigned long)&src->i); dst->i.i_size = 0; - dst->i.i_blocks = 1; - dst->i.i_links = 1; + dst->i.i_blocks = cpu_to_le64(1); + dst->i.i_links = cpu_to_le32(1); dst->i.i_xattr_nid = 0; new_ni = old_ni; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 5d525ed312ba..0ab92d643052 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -177,7 +177,7 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) void *kaddr = page_address(page); struct f2fs_node *rn = (struct f2fs_node *)kaddr; rn->footer.cp_ver = ckpt->checkpoint_ver; - rn->footer.next_blkaddr = blkaddr; + rn->footer.next_blkaddr = cpu_to_le32(blkaddr); } static inline nid_t ino_of_node(struct page *node_page) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 7a43df0b72c1..222a7bb92214 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -81,7 +81,7 @@ static int recover_inode(struct inode *inode, struct page *node_page) struct f2fs_node *raw_node = (struct f2fs_node *)kaddr; struct f2fs_inode *raw_inode = &(raw_node->i); - inode->i_mode = le32_to_cpu(raw_inode->i_mode); + inode->i_mode = le16_to_cpu(raw_inode->i_mode); i_size_write(inode, le64_to_cpu(raw_inode->i_size)); inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ed7c079cfc7f..d973c56e8bd6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -630,7 +630,7 @@ static void f2fs_end_io_write(struct bio *bio, int 
err) SetPageError(page); if (page->mapping) set_bit(AS_EIO, &page->mapping->flags); - p->sbi->ckpt->ckpt_flags |= CP_ERROR_FLAG; + set_ckpt_flags(p->sbi->ckpt, CP_ERROR_FLAG); set_page_dirty(page); } end_page_writeback(page); @@ -1067,7 +1067,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) segno = le32_to_cpu(ckpt->cur_data_segno[type]); blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type - CURSEG_HOT_DATA]); - if (ckpt->ckpt_flags & CP_UMOUNT_FLAG) + if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) blk_addr = sum_blk_addr(sbi, NR_CURSEG_TYPE, type); else blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type); @@ -1076,7 +1076,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) CURSEG_HOT_NODE]); blk_off = le16_to_cpu(ckpt->cur_node_blkoff[type - CURSEG_HOT_NODE]); - if (ckpt->ckpt_flags & CP_UMOUNT_FLAG) + if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) blk_addr = sum_blk_addr(sbi, NR_CURSEG_NODE_TYPE, type - CURSEG_HOT_NODE); else @@ -1087,7 +1087,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) sum = (struct f2fs_summary_block *)page_address(new); if (IS_NODESEG(type)) { - if (ckpt->ckpt_flags & CP_UMOUNT_FLAG) { + if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) { struct f2fs_summary *ns = &sum->entries[0]; int i; for (i = 0; i < sbi->blocks_per_seg; i++, ns++) { @@ -1119,7 +1119,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) { int type = CURSEG_HOT_DATA; - if (sbi->ckpt->ckpt_flags & CP_COMPACT_SUM_FLAG) { + if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) { /* restore for compacted data summary */ if (read_compacted_summaries(sbi)) return -EINVAL; @@ -1208,7 +1208,7 @@ static void write_normal_summaries(struct f2fs_sb_info *sbi, void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) { - if (sbi->ckpt->ckpt_flags & CP_COMPACT_SUM_FLAG) + if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) write_compacted_summaries(sbi, start_blk); else write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA); @@ -1216,7 +1216,7 @@ void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) { - if (sbi->ckpt->ckpt_flags & CP_UMOUNT_FLAG) + if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG)) write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); return; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8661c93538af..878bf382f848 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -89,7 +89,7 @@ static void f2fs_i_callback(struct rcu_head *head) kmem_cache_free(f2fs_inode_cachep, F2FS_I(inode)); } -void f2fs_destroy_inode(struct inode *inode) +static void f2fs_destroy_inode(struct inode *inode) { call_rcu(&inode->i_rcu, f2fs_i_callback); } @@ -445,7 +445,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (sanity_check_raw_super(raw_super)) goto free_sb_buf; - sb->s_maxbytes = max_file_size(raw_super->log_blocksize); + sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize)); sb->s_max_links = F2FS_LINK_MAX; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); @@ -527,7 +527,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) /* if there are nt orphan nodes free them */ err = -EINVAL; - if (!(sbi->ckpt->ckpt_flags & CP_UMOUNT_FLAG) && + if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) && recover_orphan_inodes(sbi)) goto free_node_inode; @@ -547,7 +547,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, 
int silent) } /* recover fsynced data */ - if (!(sbi->ckpt->ckpt_flags & CP_UMOUNT_FLAG) && + if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) && !test_opt(sbi, DISABLE_ROLL_FORWARD)) recover_fsync_data(sbi); -- cgit v1.2.1 From 0a8165d7c2cf1395059db20ab07665baf3758fcd Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 29 Nov 2012 13:28:09 +0900 Subject: f2fs: adjust kernel coding style As pointed out by Randy Dunlap, this patch removes all usage of "/**" for comment blocks. Instead, just use "/*". Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 2 +- fs/f2fs/acl.h | 2 +- fs/f2fs/checkpoint.c | 10 +++++----- fs/f2fs/data.c | 12 ++++++------ fs/f2fs/debug.c | 6 +++--- fs/f2fs/dir.c | 4 ++-- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 2 +- fs/f2fs/gc.c | 10 +++++----- fs/f2fs/gc.h | 4 ++-- fs/f2fs/hash.c | 2 +- fs/f2fs/inode.c | 4 ++-- fs/f2fs/namei.c | 4 ++-- fs/f2fs/node.c | 22 +++++++++++----------- fs/f2fs/node.h | 2 +- fs/f2fs/recovery.c | 2 +- fs/f2fs/segment.c | 26 +++++++++++++------------- fs/f2fs/segment.h | 2 +- fs/f2fs/super.c | 2 +- fs/f2fs/xattr.c | 2 +- fs/f2fs/xattr.h | 4 ++-- 21 files changed, 63 insertions(+), 63 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index dff2a2bfa755..1ac9a4b24f6e 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -1,4 +1,4 @@ -/** +/* * fs/f2fs/acl.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index c97675e18fe2..80f430674417 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -1,4 +1,4 @@ -/** +/* * fs/f2fs/acl.h * * Copyright (c) 2012 Samsung Electronics Co., Ltd. diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 7c18f8efaadc..6ef36c37e2be 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1,4 +1,4 @@ -/** +/* * fs/f2fs/checkpoint.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. @@ -24,7 +24,7 @@ static struct kmem_cache *orphan_entry_slab; static struct kmem_cache *inode_entry_slab; -/** +/* * We guarantee no failure on the returned page. */ struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) @@ -44,7 +44,7 @@ repeat: return page; } -/** +/* * We guarantee no failure on the returned page. */ struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) @@ -543,7 +543,7 @@ retry: goto retry; } -/** +/* * Freeze all the FS-operations for checkpoint. */ void block_operations(struct f2fs_sb_info *sbi) @@ -727,7 +727,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) F2FS_RESET_SB_DIRT(sbi); } -/** +/* * We guarantee that this checkpoint procedure should not fail. */ void write_checkpoint(struct f2fs_sb_info *sbi, bool blocked, bool is_umount) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 5635cc5a9d4d..444c2a6fbaa0 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1,4 +1,4 @@ -/** +/* * fs/f2fs/data.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. @@ -21,7 +21,7 @@ #include "node.h" #include "segment.h" -/** +/* * Lock ordering for the change of data block address: * ->data_page * ->node_page @@ -207,7 +207,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) return page; } -/** +/* * If it tries to access a hole, return an error. * Because, the callers, functions in dir.c and GC, should be able to know * whether this page exists or not. @@ -247,7 +247,7 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index) return page; } -/** +/* * Caller ensures that this data page is never allocated. * A new zero-filled data page is allocated in the page cache. 
*/ @@ -322,7 +322,7 @@ static void read_end_io(struct bio *bio, int err) bio_put(bio); } -/** +/* * Fill the locked page with data located in the block address. * Read operation is synchronous, and caller must unlock the page. */ @@ -367,7 +367,7 @@ int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page, return 0; } -/** +/* * This function should be used by the data read flow only where it * does not check the "create" flag that indicates block allocation. * The reason for this special functionality is to exploit VFS readahead diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index fb62960a1dc1..0e0380a588ad 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -1,4 +1,4 @@ -/** +/* * f2fs debugging statistics * * Copyright (c) 2012 Samsung Electronics Co., Ltd. @@ -78,7 +78,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) } } -/** +/* * This function calculates BDF of every segments */ static void update_sit_info(struct f2fs_sb_info *sbi) @@ -113,7 +113,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi) si->avg_vblocks = 0; } -/** +/* * This function calculates memory footprint. */ static void update_mem_info(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 5ec7a06120e1..089eb6766890 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -1,4 +1,4 @@ -/** +/* * fs/f2fs/dir.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. @@ -453,7 +453,7 @@ fail: return err; } -/** +/* * It only removes the dentry from the dentry page,corresponding name * entry in name page does not need to be touched during deletion. */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8d7fde1bda1e..8c3f1ef6ace2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1,4 +1,4 @@ -/** +/* * fs/f2fs/f2fs.h * * Copyright (c) 2012 Samsung Electronics Co., Ltd. diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f5ae36d19f4f..c1a108ffbfcc 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1,4 +1,4 @@ -/** +/* * fs/f2fs/file.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 46774ce3ae03..3271be42c0b6 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1,4 +1,4 @@ -/** +/* * fs/f2fs/gc.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. @@ -213,7 +213,7 @@ static unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, return get_cb_cost(sbi, segno); } -/** +/* * This function is called from two pathes. * One is garbage collection and the other is SSR segment selection. * When it is called during GC, it just gets a victim segment @@ -359,7 +359,7 @@ static int check_valid_map(struct f2fs_sb_info *sbi, return ret ? GC_OK : GC_NEXT; } -/** +/* * This function compares node address got in summary with that in NAT. * On validity, copy that node with cold status, otherwise (invalid node) * ignore that. @@ -425,7 +425,7 @@ next_step: return GC_DONE; } -/** +/* * Calculate start block index that this node page contains */ block_t start_bidx_of_node(unsigned int node_ofs) @@ -516,7 +516,7 @@ out: f2fs_put_page(page, 1); } -/** +/* * This function tries to get parent node of victim data block, and identifies * data block validity. If the block is valid, copy that with cold status and * modify parent node. diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index cf42a554ca0a..b026d9354ccd 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -1,4 +1,4 @@ -/** +/* * fs/f2fs/gc.h * * Copyright (c) 2012 Samsung Electronics Co., Ltd. 
@@ -42,7 +42,7 @@ struct inode_entry { struct inode *inode; }; -/** +/* * inline functions */ static inline block_t free_user_blocks(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index beb155e8d06d..a60f04200f8b 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -1,4 +1,4 @@ -/** +/* * fs/f2fs/hash.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 94f13d2815e9..aa4ef4f48ffd 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -1,4 +1,4 @@ -/** +/* * fs/f2fs/inode.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. @@ -235,7 +235,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) return 0; } -/** +/* * Called at the last iput() if i_nlink is zero */ void f2fs_evict_inode(struct inode *inode) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index aec362f6f0b0..63efd77fab92 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -1,4 +1,4 @@ -/** +/* * fs/f2fs/namei.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. @@ -96,7 +96,7 @@ static int is_multimedia_file(const unsigned char *s, const char *sub) return ret; } -/** +/* * Set multimedia files as cold files for hot/cold data separation */ static inline void set_cold_file(struct f2fs_sb_info *sbi, struct inode *inode, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5d421fe22575..25d303646da7 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1,4 +1,4 @@ -/** +/* * fs/f2fs/node.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. @@ -81,7 +81,7 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) return dst_page; } -/** +/* * Readahead NAT pages */ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) @@ -251,7 +251,7 @@ static int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) return nr_shrink; } -/** +/* * This function returns always success */ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) @@ -302,7 +302,7 @@ cache: cache_nat_entry(NM_I(sbi), nid, &ne); } -/** +/* * The maximum depth is four. * Offset[0] will have raw inode offset. */ @@ -649,7 +649,7 @@ fail: return err; } -/** +/* * All the block addresses of data and nodes should be nullified. */ int truncate_inode_blocks(struct inode *inode, pgoff_t from) @@ -860,7 +860,7 @@ static int read_node_page(struct page *page, int type) return f2fs_readpage(sbi, page, ni.blk_addr, type); } -/** +/* * Readahead a node page */ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) @@ -910,7 +910,7 @@ struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) return page; } -/** +/* * Return a locked page for the desired node page. * And, readahead MAX_RA_NODE number of node pages. */ @@ -1186,7 +1186,7 @@ static int f2fs_release_node_page(struct page *page, gfp_t wait) return 0; } -/** +/* * Structure of the f2fs node operations */ const struct address_space_operations f2fs_node_aops = { @@ -1386,7 +1386,7 @@ retry: return true; } -/** +/* * alloc_nid() should be called prior to this function. */ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) @@ -1403,7 +1403,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) spin_unlock(&nm_i->free_nid_list_lock); } -/** +/* * alloc_nid() should be called prior to this function. */ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) @@ -1545,7 +1545,7 @@ retry: return true; } -/** +/* * This function is called during the checkpointing process. 
*/ void flush_nat_entries(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 0ab92d643052..afdb130f782e 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -1,4 +1,4 @@ -/** +/* * fs/f2fs/node.h * * Copyright (c) 2012 Samsung Electronics Co., Ltd. diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 222a7bb92214..b07e9b6ef376 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -1,4 +1,4 @@ -/** +/* * fs/f2fs/recovery.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d973c56e8bd6..a177eb387d38 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1,4 +1,4 @@ -/** +/* * fs/f2fs/segment.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. @@ -36,7 +36,7 @@ static int need_to_flush(struct f2fs_sb_info *sbi) return 0; } -/** +/* * This function balances dirty node and dentry pages. * In addition, it controls garbage collection. */ @@ -105,7 +105,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, } } -/** +/* * Should not occur error such as -ENOMEM. * Adding dirty entry into seglist is not critical operation. * If a given segment is one of current working segments, it won't be added. @@ -136,7 +136,7 @@ void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) return; } -/** +/* * Should call clear_prefree_segments after checkpoint is done. */ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) @@ -269,7 +269,7 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) mutex_unlock(&sit_i->sentry_lock); } -/** +/* * This function should be resided under the curseg_mutex lock */ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, @@ -282,7 +282,7 @@ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, return; } -/** +/* * Calculate the number of current summary pages for writing */ int npages_for_summary_flush(struct f2fs_sb_info *sbi) @@ -309,7 +309,7 @@ int npages_for_summary_flush(struct f2fs_sb_info *sbi) return 3; } -/** +/* * Caller should put this summary page */ struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno) @@ -371,7 +371,7 @@ next: return NULL_SEGNO; } -/** +/* * Find a new segment from the free segments bitmap to right order * This function should be returned with success, otherwise BUG */ @@ -483,7 +483,7 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) __set_sit_entry_type(sbi, type, curseg->segno, modified); } -/** +/* * Allocate a current working segment. * This function always allocates a free segment in LFS manner. */ @@ -520,7 +520,7 @@ static void __next_free_blkoff(struct f2fs_sb_info *sbi, seg->next_blkoff = ofs; } -/** +/* * If a segment is written by LFS manner, next block offset is just obtained * by increasing the current block offset. However, if a segment is written by * SSR manner, next block offset obtained by calling __next_free_blkoff @@ -534,7 +534,7 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, seg->next_blkoff++; } -/** +/* * This function always allocates a used segment (from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ @@ -1310,7 +1310,7 @@ static bool flush_sits_in_journal(struct f2fs_sb_info *sbi) return 0; } -/** +/* * CP calls this function, which flushes SIT entries including sit_journal, * and moves prefree segs to free segs. 
*/ @@ -1624,7 +1624,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) return init_victim_segmap(sbi); } -/** +/* * Update min, max modified time for cost-benefit GC algorithm */ static void init_min_max_mtime(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index e380a8ef13f5..2c445f8947c9 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -1,4 +1,4 @@ -/** +/* * fs/f2fs/segment.h * * Copyright (c) 2012 Samsung Electronics Co., Ltd. diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 878bf382f848..4360600c81e8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1,4 +1,4 @@ -/** +/* * fs/f2fs/super.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index aca50fe163f6..5324d1e9d168 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -1,4 +1,4 @@ -/** +/* * fs/f2fs/xattr.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 29b0a08e1e14..49c9558305e3 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -1,4 +1,4 @@ -/** +/* * fs/f2fs/xattr.h * * Copyright (c) 2012 Samsung Electronics Co., Ltd. @@ -77,7 +77,7 @@ struct f2fs_xattr_entry { #define MAX_VALUE_LEN (MIN_OFFSET - sizeof(struct f2fs_xattr_header) - \ sizeof(struct f2fs_xattr_entry)) -/** +/* * On-disk structure of f2fs_xattr * We use only 1 block for xattr. * -- cgit v1.2.1 From 573ea5fcf0a65fea4811f82edd6dc6045c04edda Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 30 Nov 2012 17:32:08 +0900 Subject: f2fs: resolve build failures There exist two build failures reported by Randy Dunlap as follows. (on i386) a. (config-r8857) ERROR: "f2fs_xattr_advise_handler" [fs/f2fs/f2fs.ko] undefined! Key configs in (config-r8857) are as follows. CONFIG_F2FS_FS=m # CONFIG_F2FS_STAT_FS is not set CONFIG_F2FS_FS_XATTR=y # CONFIG_F2FS_FS_POSIX_ACL is not set The error occurred due to a mistake in the location of the function. Recently we added a new functionality for users to indicate cold files explicitly through xattr operations (i.e., f2fs_xattr_advise_handler). This handler should have been added in xattr.c instead of acl.c in order to avoid an undefined symbol in cases like this one, where XATTR is set and ACL is not set. b. (config-r8855) fs/f2fs/file.c: In function 'f2fs_vm_page_mkwrite': fs/f2fs/file.c:97:2: error: implicit declaration of function 'block_page_mkwrite_return' Key config in (config-r8855) is CONFIG_BLOCK. Obviously, f2fs works on top of a block device, so we should carefully consider this sort of config dependency. The reason this error occurred is that f2fs_vm_page_mkwrite() calls block_page_mkwrite_return(), which is enabled only if CONFIG_BLOCK is set. Reported-by: Randy Dunlap Signed-off-by: Jaegeuk Kim Acked-by: Randy Dunlap --- fs/f2fs/Kconfig | 1 + fs/f2fs/acl.c | 51 --------------------------------------------------- fs/f2fs/xattr.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 37a6b8e9438f..fd27e7e6326e 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -1,5 +1,6 @@ config F2FS_FS tristate "F2FS filesystem support (EXPERIMENTAL)" + depends on BLOCK help F2FS is based on Log-structured File System (LFS), which supports versatile "flash-friendly" features. 
The design has been focused on diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 1ac9a4b24f6e..fed74d193ffb 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -412,54 +412,3 @@ const struct xattr_handler f2fs_xattr_acl_access_handler = { .get = f2fs_xattr_get_acl, .set = f2fs_xattr_set_acl, }; - -static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t name_len, int type) -{ - const char *xname = F2FS_SYSTEM_ADVISE_PREFIX; - size_t size; - - if (type != F2FS_XATTR_INDEX_ADVISE) - return 0; - - size = strlen(xname) + 1; - if (list && size <= list_size) - memcpy(list, xname, size); - return size; -} - -static int f2fs_xattr_advise_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) -{ - struct inode *inode = dentry->d_inode; - - if (strcmp(name, "") != 0) - return -EINVAL; - - *((char *)buffer) = F2FS_I(inode)->i_advise; - return sizeof(char); -} - -static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - struct inode *inode = dentry->d_inode; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!inode_owner_or_capable(inode)) - return -EPERM; - if (value == NULL) - return -EINVAL; - - F2FS_I(inode)->i_advise |= *(char *)value; - return 0; -} - -const struct xattr_handler f2fs_xattr_advise_handler = { - .prefix = F2FS_SYSTEM_ADVISE_PREFIX, - .flags = F2FS_XATTR_INDEX_ADVISE, - .list = f2fs_xattr_advise_list, - .get = f2fs_xattr_advise_get, - .set = f2fs_xattr_advise_set, -}; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 5324d1e9d168..7d52e8dc0c59 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -102,6 +102,49 @@ static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name, return f2fs_setxattr(dentry->d_inode, type, name, value, size); } +static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + const char *xname = F2FS_SYSTEM_ADVISE_PREFIX; + size_t size; + + if (type != F2FS_XATTR_INDEX_ADVISE) + return 0; + + size = strlen(xname) + 1; + if (list && size <= list_size) + memcpy(list, xname, size); + return size; +} + +static int f2fs_xattr_advise_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + struct inode *inode = dentry->d_inode; + + if (strcmp(name, "") != 0) + return -EINVAL; + + *((char *)buffer) = F2FS_I(inode)->i_advise; + return sizeof(char); +} + +static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + struct inode *inode = dentry->d_inode; + + if (strcmp(name, "") != 0) + return -EINVAL; + if (!inode_owner_or_capable(inode)) + return -EPERM; + if (value == NULL) + return -EINVAL; + + F2FS_I(inode)->i_advise |= *(char *)value; + return 0; +} + const struct xattr_handler f2fs_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .flags = F2FS_XATTR_INDEX_USER, @@ -118,6 +161,14 @@ const struct xattr_handler f2fs_xattr_trusted_handler = { .set = f2fs_xattr_generic_set, }; +const struct xattr_handler f2fs_xattr_advise_handler = { + .prefix = F2FS_SYSTEM_ADVISE_PREFIX, + .flags = F2FS_XATTR_INDEX_ADVISE, + .list = f2fs_xattr_advise_list, + .get = f2fs_xattr_advise_get, + .set = f2fs_xattr_advise_set, +}; + static const struct xattr_handler *f2fs_xattr_handler_map[] = { [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler, #ifdef CONFIG_F2FS_FS_POSIX_ACL -- cgit v1.2.1 From 
be4124f8720ef83757a66caa46f6045f0292d1f4 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 1 Dec 2012 10:55:12 +0900 Subject: f2fs: fix the compiler warning for uninitialized use of variable When CONFIG_CC_OPTIMIZE_FOR_SIZE is enabled in the kernel, the -Os optimisation flag is passed to gcc for compilation, and somehow, while trying to optimize the code, the compiler might not be able to see the initialisation of the ne struct variable inside the get_node_info() function, which results in the following warning: fs/f2fs/node.c: In function 'get_node_info': fs/f2fs/node.c:175:3: warning: 'ne.block_addr' may be used uninitialized in this function [-Wuninitialized] fs/f2fs/node.c:265:24: note: 'ne.block_addr' was declared here fs/f2fs/node.c:176:3: warning: 'ne.ino' may be used uninitialized in this function [-Wuninitialized] fs/f2fs/node.c:265:24: note: 'ne.ino' was declared here fs/f2fs/node.c:177:3: warning: 'ne.version' may be used uninitialized in this function [-Wuninitialized] fs/f2fs/node.c:265:24: note: 'ne.version' was declared here Hence, let's initialise the ne struct variable to zero, which removes this warning; doing this does not seem to have any impact on the code behavior. Signed-off-by: Namjae Jeon Signed-off-by: Pankaj Kumar --- fs/f2fs/node.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 25d303646da7..19870361497e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -266,6 +266,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) struct nat_entry *e; int i; + memset(&ne, 0, sizeof(struct f2fs_nat_entry)); ni->nid = nid; /* Check nat cache */ -- cgit v1.2.1 From 72ce6094c0d3c1f0025eb118bb9406206f277479 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 1 Dec 2012 10:55:32 +0900 Subject: f2fs: show error in case of invalid mount arguments Print the invalid argument/value from parse_options in case of mount failure. Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat --- fs/f2fs/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4360600c81e8..5830e537c376 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -311,6 +311,8 @@ static int parse_options(struct f2fs_sb_info *sbi, char *options) set_opt(sbi, DISABLE_EXT_IDENTIFY); break; default: + pr_err("Unrecognized mount option \"%s\" or missing value\n", + p); return -EINVAL; } } -- cgit v1.2.1 From 154a086529b50236782a2d4365a1d1c359adc7af Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 1 Dec 2012 10:55:45 +0900 Subject: f2fs: remove unneeded memset from init_once Since __GFP_ZERO is used during f2fs inode allocation, we do not need a memset for f2fs_inode_info, as it is already zeroed out. 
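For illustration, a minimal sketch of the allocation side that makes the constructor's memset redundant (an assumption for illustration only; example_alloc_inode is a hypothetical name, not the patched function, though f2fs_inode_cachep and vfs_inode appear in the hunks below):

/* Sketch: the object comes back from the allocator already zero-filled
 * via __GFP_ZERO, so the slab constructor only needs one-time setup
 * such as inode_init_once(), not a memset(). */
static struct inode *example_alloc_inode(struct super_block *sb)
{
	struct f2fs_inode_info *fi;

	/* GFP_NOFS | __GFP_ZERO: every field of *fi starts out zeroed */
	fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_NOFS | __GFP_ZERO);
	if (!fi)
		return NULL;
	return &fi->vfs_inode;
}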
Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat --- fs/f2fs/super.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 5830e537c376..13867322cf5a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -57,7 +57,6 @@ static void init_once(void *foo) { struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo; - memset(fi, 0, sizeof(*fi)); inode_init_once(&fi->vfs_inode); } -- cgit v1.2.1 From 1fa95b0b6798164a31c1048efdf62b71038eb3d5 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 1 Dec 2012 10:56:01 +0900 Subject: f2fs: check read only condition before beginning write out If the filesystem is mounted read-only, then return at that point itself instead of first doing a writeout/wait and then checking the read-only condition. Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat --- fs/f2fs/file.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c1a108ffbfcc..89241c50eb96 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -132,14 +132,15 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) .for_reclaim = 0, }; + if (inode->i_sb->s_flags & MS_RDONLY) + return 0; + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret) return ret; mutex_lock(&inode->i_mutex); - if (inode->i_sb->s_flags & MS_RDONLY) - goto out; if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) goto out; -- cgit v1.2.1 From 1042d60f917d78ef1a6eaea297a1020484d4bf74 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 1 Dec 2012 10:56:13 +0900 Subject: f2fs: remove unneeded initialization No need to initialize "struct f2fs_gc_kthread *gc_th = NULL", as the gc_th == NULL case is already taken care of by checking the return value of kmalloc(). Also fix code in other places. Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat --- fs/f2fs/gc.c | 2 +- fs/f2fs/segment.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 3271be42c0b6..644aa3808273 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -89,7 +89,7 @@ static int gc_thread_func(void *data) int start_gc_thread(struct f2fs_sb_info *sbi) { - struct f2fs_gc_kthread *gc_th = NULL; + struct f2fs_gc_kthread *gc_th; gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); if (!gc_th) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a177eb387d38..969df1a30d1c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1489,7 +1489,7 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) static int build_curseg(struct f2fs_sb_info *sbi) { - struct curseg_info *array = NULL; + struct curseg_info *array; int i; array = kzalloc(sizeof(*array) * NR_CURSEG_TYPE, GFP_KERNEL); @@ -1656,7 +1656,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - struct f2fs_sm_info *sm_info = NULL; + struct f2fs_sm_info *sm_info; int err; sm_info = kzalloc(sizeof(struct f2fs_sm_info), GFP_KERNEL); -- cgit v1.2.1 From 61412b64b965af72798000c3c921e88db31216b1 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 1 Dec 2012 10:56:25 +0900 Subject: f2fs: move error condition for mkdir at proper place In the f2fs_mkdir() function, err is initialized without even checking whether there was any error in the new inode creation. So, instead, check the inode error first and derive the return value from it only on the error path. 
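As an aside on the pattern (illustration only, mirroring the hunk below): PTR_ERR() just reinterprets the pointer bits, so its result is only meaningful after IS_ERR() has confirmed that the pointer encodes an error, and deferring the conversion avoids a pointless assignment on the success path:

	inode = f2fs_new_inode(dir, S_IFDIR | mode);
	if (IS_ERR(inode))
		return PTR_ERR(inode);	/* convert only on the error path */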
Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat --- fs/f2fs/namei.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 63efd77fab92..2d720ca47071 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -287,9 +287,8 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) int err; inode = f2fs_new_inode(dir, S_IFDIR | mode); - err = PTR_ERR(inode); if (IS_ERR(inode)) - return err; + return PTR_ERR(inode); inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; -- cgit v1.2.1 From 705f814e34e08f6169439014a2916fd5afbdf232 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Sun, 2 Dec 2012 08:11:38 -0500 Subject: f2fs: remove unused variable The variables node_page and page_offset are initialized but never used otherwise, so remove those unused variables. Signed-off-by: Wei Yongjun --- fs/f2fs/dir.c | 2 -- fs/f2fs/file.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 089eb6766890..2a20c504ecd8 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -509,13 +509,11 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, } if (bit_pos == NR_DENTRY_IN_BLOCK) { - loff_t page_offset; truncate_hole(dir, page->index, page->index + 1); clear_page_dirty_for_io(page); ClearPageUptodate(page); dec_page_count(sbi, F2FS_DIRTY_DENTS); inode_dec_dirty_dents(dir); - page_offset = page->index << PAGE_CACHE_SHIFT; f2fs_put_page(page, 1); } else { f2fs_put_page(page, 1); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 89241c50eb96..f9e085dfb1f0 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -30,7 +30,6 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct page *node_page; block_t old_blk_addr; struct dnode_of_data dn; int err; @@ -50,7 +49,6 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, old_blk_addr = dn.data_blkaddr; - node_page = dn.node_page; if (old_blk_addr == NULL_ADDR) { err = reserve_new_block(&dn); -- cgit v1.2.1 From c212991a6bc3ba120d41205a294c5b89f05f1535 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 8 Dec 2012 14:53:40 +0900 Subject: f2fs: rewrite f2fs_bio_alloc to make it simpler Since GFP_NOFS (which includes __GFP_WAIT) is used for bio allocation requests in f2fs, there is no chance of the BIO allocation returning NULL. Make the bio allocation routine for f2fs simpler accordingly. 
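For context, a hedged sketch of the no-fail assumption being relied on here (not new code in the patch): allocation with a mask that may sleep is backed by a mempool, so it retries internally rather than returning NULL:

	/* Sketch: with a sleeping mask such as GFP_NOFS (which includes
	 * __GFP_WAIT), bio_alloc() draws from a mempool and keeps retrying,
	 * so the result can be used without a NULL check for small nr_vecs. */
	struct bio *bio = bio_alloc(GFP_NOFS, nr_vecs);
	bio->bi_bdev = bdev;	/* safe: bio is non-NULL here */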
Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat --- fs/f2fs/segment.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 969df1a30d1c..8894b399770d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -647,28 +647,18 @@ struct bio *f2fs_bio_alloc(struct block_device *bdev, sector_t first_sector, int nr_vecs, gfp_t gfp_flags) { struct bio *bio; -repeat: + /* allocate new bio */ bio = bio_alloc(gfp_flags, nr_vecs); - if (bio == NULL && (current->flags & PF_MEMALLOC)) { - while (!bio && (nr_vecs /= 2)) - bio = bio_alloc(gfp_flags, nr_vecs); - } - if (bio) { - bio->bi_bdev = bdev; - bio->bi_sector = first_sector; + bio->bi_bdev = bdev; + bio->bi_sector = first_sector; retry: - bio->bi_private = kmalloc(sizeof(struct bio_private), - GFP_NOFS | __GFP_HIGH); - if (!bio->bi_private) { - cond_resched(); - goto retry; - } - } - if (bio == NULL) { + bio->bi_private = kmalloc(sizeof(struct bio_private), + GFP_NOFS | __GFP_HIGH); + if (!bio->bi_private) { cond_resched(); - goto repeat; + goto retry; } return bio; } -- cgit v1.2.1 From a0d42539e1d6c818222822dedd92a1674040e20c Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 8 Dec 2012 14:54:18 +0900 Subject: f2fs: make use of GFP_F2FS_ZERO for setting gfp_mask Since GFP_NOFS and __GFP_ZERO are being used to set gfp_mask, we can instead make use of the already predefined macro GFP_F2FS_ZERO. Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat --- fs/f2fs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 2d720ca47071..89b7675dc377 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -293,7 +293,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS | __GFP_ZERO); + mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); set_inode_flag(F2FS_I(inode), FI_INC_LINK); err = f2fs_add_link(dentry, inode); -- cgit v1.2.1 From 508198be3c2f7f8929101bb0daeb8f0039c1dc7f Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 8 Dec 2012 14:54:35 +0900 Subject: f2fs: remove redundant call to f2fs_put_page in delete entry Since we need to put the page after deleting the entry anyway, there is no need to make the same call under different conditions. Move f2fs_put_page() out of the two conditions and call it once. Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat --- fs/f2fs/dir.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 2a20c504ecd8..fc02d8b43aea 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -514,10 +514,9 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, ClearPageUptodate(page); dec_page_count(sbi, F2FS_DIRTY_DENTS); inode_dec_dirty_dents(dir); - f2fs_put_page(page, 1); - } else { - f2fs_put_page(page, 1); } + f2fs_put_page(page, 1); + mutex_unlock_op(sbi, DENTRY_OPS); } -- cgit v1.2.1 From 457d08ee4fd91c8df17917ff2d32565e6adacbfc Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 8 Dec 2012 14:54:50 +0900 Subject: f2fs: introduce accessor to retrieve number of dentry slots Simplify code by providing the accessor macro to retrieve the number of dentry slots for a given filename length. 
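For reference, the accessor presumably reduces to the same ceiling division as the open-coded expressions it replaces below (a sketch inferred from those expressions, not a quote of the actual header):

/* A name of length x occupies ceil(x / F2FS_NAME_LEN) dentry slots,
 * since each slot stores F2FS_NAME_LEN name bytes. */
#define GET_DENTRY_SLOTS(x)	(((x) + F2FS_NAME_LEN - 1) / F2FS_NAME_LEN)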
Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat --- fs/f2fs/dir.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index fc02d8b43aea..d900c088c7c6 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -99,8 +99,7 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, NR_DENTRY_IN_BLOCK, 0); while (bit_pos < NR_DENTRY_IN_BLOCK) { de = &dentry_blk->dentry[bit_pos]; - slots = (le16_to_cpu(de->name_len) + F2FS_NAME_LEN - 1) / - F2FS_NAME_LEN; + slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); if (early_match_name(name, namelen, namehash, de)) { if (!memcmp(dentry_blk->filename[bit_pos], @@ -130,7 +129,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, unsigned int level, const char *name, int namelen, f2fs_hash_t namehash, struct page **res_page) { - int s = (namelen + F2FS_NAME_LEN - 1) / F2FS_NAME_LEN; + int s = GET_DENTRY_SLOTS(namelen); unsigned int nbucket, nblock; unsigned int bidx, end_block; struct page *dentry_page; @@ -383,7 +382,7 @@ int f2fs_add_link(struct dentry *dentry, struct inode *inode) int namelen = dentry->d_name.len; struct page *dentry_page = NULL; struct f2fs_dentry_block *dentry_blk = NULL; - int slots = (namelen + F2FS_NAME_LEN - 1) / F2FS_NAME_LEN; + int slots = GET_DENTRY_SLOTS(namelen); int err = 0; int i; @@ -465,8 +464,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, struct address_space *mapping = page->mapping; struct inode *dir = mapping->host; struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); - int slots = (le16_to_cpu(dentry->name_len) + F2FS_NAME_LEN - 1) / - F2FS_NAME_LEN; + int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); void *kaddr = page_address(page); int i; @@ -641,8 +639,7 @@ static int f2fs_readdir(struct file *file, void *dirent, filldir_t filldir) file->f_pos += bit_pos - start_bit_pos; goto success; } - slots = (le16_to_cpu(de->name_len) + F2FS_NAME_LEN - 1) - / F2FS_NAME_LEN; + slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); bit_pos += slots; } bit_pos = 0; -- cgit v1.2.1 From 3cd8a23948b29301f8f67b8d70c5c18fabbc05e1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 10 Dec 2012 09:26:05 +0900 Subject: f2fs: cleanup the f2fs_bio_alloc routine Do more cleanup for better code readability. - Change the parameter set of f2fs_bio_alloc() This function should only allocate a bio, since it is not something like f2fs_bio_init(). Instead, the caller should initialize the allocated bio. - Introduce SECTOR_FROM_BLOCK This macro translates a block address to its sector address. 
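A quick worked example of the translation (illustration only; the actual macro appears in the segment.h hunk below):

	/* With 4 KiB blocks (log_blocksize == 12) and 512-byte sectors
	 * (F2FS_LOG_SECTOR_SIZE == 9), a block spans 2^(12 - 9) = 8 sectors,
	 * so block N starts at sector N << 3. */
	sector_t sector = blk_addr << (sbi->log_blocksize - F2FS_LOG_SECTOR_SIZE);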
Signed-off-by: Jaegeuk Kim Reviewed-by: Namjae Jeon --- fs/f2fs/data.c | 5 +++-- fs/f2fs/f2fs.h | 2 +- fs/f2fs/segment.c | 33 ++++++++++++++++++--------------- fs/f2fs/segment.h | 3 +++ 4 files changed, 25 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 444c2a6fbaa0..655aeabc1dd4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -343,11 +343,12 @@ int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page, down_read(&sbi->bio_sem); /* Allocate a new bio */ - bio = f2fs_bio_alloc(bdev, blk_addr << (sbi->log_blocksize - 9), - 1, GFP_NOFS | __GFP_HIGH); + bio = f2fs_bio_alloc(bdev, 1); /* Initialize the bio */ + bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); bio->bi_end_io = read_end_io; + if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { kfree(bio->bi_private); bio_put(bio); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8c3f1ef6ace2..2bce3a62c0ba 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -924,7 +924,7 @@ void clear_prefree_segments(struct f2fs_sb_info *); int npages_for_summary_flush(struct f2fs_sb_info *); void allocate_new_segments(struct f2fs_sb_info *); struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); -struct bio *f2fs_bio_alloc(struct block_device *, sector_t, int, gfp_t); +struct bio *f2fs_bio_alloc(struct block_device *, int); void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool sync); int write_meta_page(struct f2fs_sb_info *, struct page *, struct writeback_control *); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8894b399770d..1b26e4ea1016 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -643,23 +643,21 @@ static void f2fs_end_io_write(struct bio *bio, int err) bio_put(bio); } -struct bio *f2fs_bio_alloc(struct block_device *bdev, sector_t first_sector, - int nr_vecs, gfp_t gfp_flags) +struct bio *f2fs_bio_alloc(struct block_device *bdev, int npages) { struct bio *bio; - - /* allocate new bio */ - bio = bio_alloc(gfp_flags, nr_vecs); - - bio->bi_bdev = bdev; - bio->bi_sector = first_sector; + struct bio_private *priv; retry: - bio->bi_private = kmalloc(sizeof(struct bio_private), - GFP_NOFS | __GFP_HIGH); - if (!bio->bi_private) { + priv = kmalloc(sizeof(struct bio_private), GFP_NOFS); + if (!priv) { cond_resched(); goto retry; } + + /* No failure on bio allocation */ + bio = bio_alloc(GFP_NOIO, npages); + bio->bi_bdev = bdev; + bio->bi_private = priv; return bio; } @@ -711,10 +709,15 @@ static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page, if (sbi->bio[type] && sbi->last_block_in_bio[type] != blk_addr - 1) do_submit_bio(sbi, type, false); alloc_new: - if (sbi->bio[type] == NULL) - sbi->bio[type] = f2fs_bio_alloc(bdev, - blk_addr << (sbi->log_blocksize - 9), - bio_get_nr_vecs(bdev), GFP_NOFS | __GFP_HIGH); + if (sbi->bio[type] == NULL) { + sbi->bio[type] = f2fs_bio_alloc(bdev, bio_get_nr_vecs(bdev)); + sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); + /* + * The end_io will be assigned at the submission phase. + * Until then, let bio_add_page() merge consecutive IOs as much + * as possible. 
+ */ + } if (bio_add_page(sbi->bio[type], page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 2c445f8947c9..0948405af6f5 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -82,6 +82,9 @@ (BITS_TO_LONGS(nr) * sizeof(unsigned long)) #define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments) +#define SECTOR_FROM_BLOCK(sbi, blk_addr) \ + (blk_addr << ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE)) + /* during checkpoint, bio_private is used to synchronize the last bio */ struct bio_private { struct f2fs_sb_info *sbi; -- cgit v1.2.1 From 6666e6aa9f36b2bfd6b30072c07b34f2a24becf1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 10 Dec 2012 17:52:48 +0900 Subject: f2fs: fix tracking parent inode number Previously, f2fs didn't correctly track the parent inode number, which is stored in each f2fs_inode. In the case of the following scenario, a bug can occur. Let's suppose there is one directory, "/b", and two files, "/a" and "/b/a". - pino of "/a" is ROOT_INO. - pino of "/b/a" is DIR_B_INO. Then, # sync : The inode pages of "/a" and "/b/a" contain the parent inode numbers as ROOT_INO and DIR_B_INO respectively. # mv /a /b/a : The parent inode number of "/a" should be changed to DIR_B_INO, but f2fs didn't do that. Ref. f2fs_set_link(). In order to fix this clearly, I added i_pino in f2fs_inode_info, and whenever it needs to be changed like in f2fs_add_link() and f2fs_set_link(), it is updated temporarily in f2fs_inode_info. And later, f2fs_write_inode() stores the latest information to the inode pages. For power-off-recovery, f2fs_sync_file() simply triggers f2fs_write_inode(). Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 10 ++++++++-- fs/f2fs/f2fs.h | 1 + fs/f2fs/inode.c | 2 ++ 3 files changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index d900c088c7c6..b4e24f32b54e 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -256,13 +256,16 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, set_page_dirty(page); dir->i_mtime = dir->i_ctime = CURRENT_TIME; mark_inode_dirty(dir); + + /* update parent inode number before releasing dentry page */ + F2FS_I(inode)->i_pino = dir->i_ino; + f2fs_put_page(page, 1); mutex_unlock_op(sbi, DENTRY_OPS); } void init_dent_inode(struct dentry *dentry, struct page *ipage) { - struct inode *dir = dentry->d_parent->d_inode; struct f2fs_node *rn; if (IS_ERR(ipage)) @@ -272,7 +275,6 @@ void init_dent_inode(struct dentry *dentry, struct page *ipage) /* copy dentry info.
to this inode page */ rn = (struct f2fs_node *)page_address(ipage); - rn->i.i_pino = cpu_to_le32(dir->i_ino); rn->i.i_namelen = cpu_to_le32(dentry->d_name.len); memcpy(rn->i.i_name, dentry->d_name.name, dentry->d_name.len); set_page_dirty(ipage); @@ -444,7 +446,11 @@ add_dentry: for (i = 0; i < slots; i++) test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); set_page_dirty(dentry_page); + update_parent_metadata(dir, inode, current_depth); + + /* update parent inode number before releasing dentry page */ + F2FS_I(inode)->i_pino = dir->i_ino; fail: kunmap(dentry_page); f2fs_put_page(dentry_page, 1); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2bce3a62c0ba..a18d63db2fb6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -136,6 +136,7 @@ struct f2fs_inode_info { unsigned long i_flags; /* keep an inode flags for ioctl */ unsigned char i_advise; /* use to give file attribute hints */ unsigned int i_current_depth; /* use only in directory structure */ + unsigned int i_pino; /* parent inode number */ umode_t i_acl_mode; /* keep file acl mode temporarily */ /* Use below internally in f2fs*/ diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index aa4ef4f48ffd..df5fb381ebf1 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -107,6 +107,7 @@ static int do_read_inode(struct inode *inode) fi->flags = 0; fi->data_version = le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver) - 1; fi->i_advise = ri->i_advise; + fi->i_pino = le32_to_cpu(ri->i_pino); get_extent_info(&fi->ext, ri->i_ext); f2fs_put_page(node_page, 1); return 0; @@ -200,6 +201,7 @@ void update_inode(struct inode *inode, struct page *node_page) ri->i_current_depth = cpu_to_le32(F2FS_I(inode)->i_current_depth); ri->i_xattr_nid = cpu_to_le32(F2FS_I(inode)->i_xattr_nid); ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags); + ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino); ri->i_generation = cpu_to_le32(inode->i_generation); set_page_dirty(node_page); } -- cgit v1.2.1 From bd9926e80330d43f15b710c2935fa41b792d56fd Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 11 Dec 2012 03:31:49 -0500 Subject: ext4: zero out inline data using memset() instead of empty_zero_page Not all architectures (in particular, sparc64) have empty_zero_page. So instead of copying from empty_zero_page, use memset to clear the inline data by signalling to ext4_xattr_set_entry() via a magic pointer value, EXT4_ZERO_XATTR_VALUE, which is defined by casting -1 to a pointer. This fixes a build failure on sparc64, and the memset() should be more efficient than using memcpy() anyway. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/inline.c | 2 +- fs/ext4/xattr.c | 22 ++++++++++++++++------ fs/ext4/xattr.h | 1 + 3 files changed, 18 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 53b2f65091dd..387c47c6cda9 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -268,7 +268,7 @@ static int ext4_create_inline_data(handle_t *handle, goto out; if (len > EXT4_MIN_INLINE_DATA_SIZE) { - value = (void *)empty_zero_page; + value = EXT4_ZERO_XATTR_VALUE; len -= EXT4_MIN_INLINE_DATA_SIZE; } else { value = ""; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 2251769a3c53..3a91ebc2b66f 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -628,9 +628,14 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) size. Just replace. */ s->here->e_value_size = cpu_to_le32(i->value_len); - memset(val + size - EXT4_XATTR_PAD, 0, - EXT4_XATTR_PAD); /* Clear pad bytes.
*/ - memcpy(val, i->value, i->value_len); + if (i->value == EXT4_ZERO_XATTR_VALUE) { + memset(val, 0, size); + } else { + /* Clear pad bytes first. */ + memset(val + size - EXT4_XATTR_PAD, 0, + EXT4_XATTR_PAD); + memcpy(val, i->value, i->value_len); + } return 0; } @@ -669,9 +674,14 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) size_t size = EXT4_XATTR_SIZE(i->value_len); void *val = s->base + min_offs - size; s->here->e_value_offs = cpu_to_le16(min_offs - size); - memset(val + size - EXT4_XATTR_PAD, 0, - EXT4_XATTR_PAD); /* Clear the pad bytes. */ - memcpy(val, i->value, i->value_len); + if (i->value == EXT4_ZERO_XATTR_VALUE) { + memset(val, 0, size); + } else { + /* Clear the pad bytes first. */ + memset(val + size - EXT4_XATTR_PAD, 0, + EXT4_XATTR_PAD); + memcpy(val, i->value, i->value_len); + } } } return 0; diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 7b5513ed3b38..69eda787a96a 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -71,6 +71,7 @@ struct ext4_xattr_entry { #define BFIRST(bh) ENTRY(BHDR(bh)+1) #define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) +#define EXT4_ZERO_XATTR_VALUE ((void *)-1) struct ext4_xattr_info { int name_index; -- cgit v1.2.1 From c1ad41f1f7270c1956da13fa8fd59d8d5929d56e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Dec 2012 10:23:45 +0100 Subject: Revert "sched/autogroup: Fix crash on reboot when autogroup is disabled" This reverts commit 5258f386ea4e8454bc801fb443e8a4217da1947c, because the underlying autogroups bug got fixed upstream in a better way, via: fd8ef11730f1 Revert "sched, autogroup: Stop going ahead if autogroup is disabled" Cc: Mike Galbraith Cc: Yong Zhang Cc: Peter Zijlstra Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- fs/proc/base.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 587631e1cd06..9e28356a959a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1272,6 +1272,81 @@ static const struct file_operations proc_pid_sched_operations = { #endif +#ifdef CONFIG_SCHED_AUTOGROUP +/* + * Print out autogroup related information: + */ +static int sched_autogroup_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + proc_sched_autogroup_show_task(p, m); + + put_task_struct(p); + + return 0; +} + +static ssize_t +sched_autogroup_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct task_struct *p; + char buffer[PROC_NUMBUF]; + int nice; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + + err = kstrtoint(strstrip(buffer), 0, &nice); + if (err < 0) + return err; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + err = proc_sched_autogroup_set_nice(p, nice); + if (err) + count = err; + + put_task_struct(p); + + return count; +} + +static int sched_autogroup_open(struct inode *inode, struct file *filp) +{ + int ret; + + ret = single_open(filp, sched_autogroup_show, NULL); + if (!ret) { + struct seq_file *m = filp->private_data; + + m->private = inode; + } + return ret; +} + +static const struct file_operations proc_pid_sched_autogroup_operations = { + .open = sched_autogroup_open, + .read = seq_read, + .write = sched_autogroup_write, + .llseek = 
seq_lseek, + .release = single_release, +}; + +#endif /* CONFIG_SCHED_AUTOGROUP */ + static ssize_t comm_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { @@ -2582,6 +2657,9 @@ static const struct pid_entry tgid_base_stuff[] = { INF("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), +#endif +#ifdef CONFIG_SCHED_AUTOGROUP + REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK -- cgit v1.2.1 From 7d3e91a89b7adbc2831334def9e494dd9892f9af Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Sat, 8 Dec 2012 15:30:18 +0100 Subject: NFSv4: Check for buffer length in __nfs4_get_acl_uncached Commit 1f1ea6c "NFSv4: Fix buffer overflow checking in __nfs4_get_acl_uncached" accidentally dropped the check for a too-small result buffer length. If someone uses getxattr on "system.nfs4_acl" on an NFSv4 mount supporting ACLs, the ACL has not been cached, and the buffer supplied is too short, we still copy the complete ACL, resulting in kernel and user space memory corruption. Signed-off-by: Sven Wegener Cc: stable@kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5eec4429970c..05e5f6f9f2b8 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3937,8 +3937,13 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu goto out_free; } nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len); - if (buf) + if (buf) { + if (res.acl_len > buflen) { + ret = -ERANGE; + goto out_free; + } _copy_from_pages(buf, pages, res.acl_data_offset, res.acl_len); + } out_ok: ret = res.acl_len; out_free: -- cgit v1.2.1 From 81d9bce5309288086b58b4d97a644e495fef75f2 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 10 Dec 2012 09:25:48 -0500 Subject: nfs: don't extend writes to cover entire page if pagecache is invalid Jian reported that the following sequence would leave "testfile" with corrupt data: # mount localhost:/export /mnt/nfs/ -o vers=3 # echo abc > /mnt/nfs/testfile; echo def >> /export/testfile; echo ghi >> /mnt/nfs/testfile # cat -v /export/testfile abc ^@^@^@^@ghi While there's no locking involved here, the operations are serialized, so CTO should prevent corruption. The first write to the file is fine and writes 4 bytes. The file is then extended on the server. When it's reopened, a GETATTR is issued and the size change is noticed. This causes NFS_INO_INVALID_DATA to be set on the file. Because the file is opened for write only, nfs_want_read_modify_write() returns 0 to nfs_write_begin(). nfs_updatepage then calls nfs_write_pageuptodate() to see if it should extend the nfs_page to cover the whole page. NFS_INO_INVALID_DATA is still set on the file at that point, but that flag is ignored and nfs_write_pageuptodate erroneously extends the write to cover the whole page, with the write done on the server side filled in with zeroes. This patch just has that function check for NFS_INO_INVALID_DATA in addition to NFS_INO_REVAL_PAGECACHE. This fixes the bug, but looking over the code, I wonder if we might have a similar bug in nfs_revalidate_size().
The difference between those two flags is very subtle, so it seems like we ought to be checking for NFS_INO_INVALID_DATA in most of the places that we look for NFS_INO_REVAL_PAGECACHE. I believe this is a regression introduced by commit 8d197a568. The code did check for NFS_INO_INVALID_DATA prior to that patch. Original bug report is here: https://bugzilla.redhat.com/show_bug.cgi?id=885743 Cc: # 3.5+ Reported-by: Jian Li Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f710e39f6ba2..eecd8b879afe 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -884,7 +884,7 @@ static bool nfs_write_pageuptodate(struct page *page, struct inode *inode) { if (nfs_have_delegated_attributes(inode)) goto out; - if (NFS_I(inode)->cache_validity & NFS_INO_REVAL_PAGECACHE) + if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE)) return false; out: return PageUptodate(page) != 0; -- cgit v1.2.1 From 85563073741bd7935a6900d567ddaf907192270d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 11 Dec 2012 10:31:12 -0500 Subject: NFSv4.1: Handle NFS4ERR_BADSLOT errors correctly Most (all) NFS4ERR_BADSLOT errors are due to the client failing to respect the server's sr_highest_slotid limit. This mainly happens due to reordered RPC requests. The way to handle it is simply to drop the slot that we're using, and retry using the new highest_slotid limits. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 92bd799eee01..a4692e97bc19 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -422,6 +422,7 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res * struct nfs4_slot *slot; unsigned long timestamp; struct nfs_client *clp; + int ret = 1; /* * sr_status remains 1 if an RPC level error occurred. The server @@ -462,6 +463,16 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res * slot->slot_nr, slot->seq_nr); goto out_retry; + case -NFS4ERR_BADSLOT: + /* + * The slot id we used was probably retired. Try again + * using a different slot id. + */ + if (rpc_restart_call_prepare(task)) { + task->tk_status = 0; + ret = 0; + } + break; default: /* Just update the slot sequence no. */ ++slot->seq_nr; @@ -470,7 +481,7 @@ out: /* The session may be reset by one of the error handlers. */ dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); nfs41_sequence_free_slot(res); - return 1; + return ret; out_retry: if (!rpc_restart_call(task)) goto out; -- cgit v1.2.1 From af402ab2b0369c2b1acf4cde72c5ed5050c74e5b Mon Sep 17 00:00:00 2001 From: Idan Kedar Date: Fri, 30 Nov 2012 16:03:31 +0200 Subject: exofs: clean up the correct page collection on write error If ore_write() fails, we would unlock the pages of pcol, which is now empty, rather than pcol_copy, which owns the pages when ore_write() is called. This means that no pages will actually be unlocked (pcol.nr_pages == 0) and the writing process (more accurately, the syncing process) will hang waiting for a writeback notification that never comes. Moreover, if ore_write() fails, pcol_free() is called for pcol, whereas pcol_copy is the object owning the ore_io_state, thus leaking the ore_io_state.
[Boaz] I have simplified Idan's original patch a bit, everything else still holds Signed-off-by: Idan Kedar Signed-off-by: Boaz Harrosh --- fs/exofs/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index b56181047751..1634b946565f 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -676,8 +676,10 @@ static int write_exec(struct page_collect *pcol) return 0; err: - _unlock_pcol_pages(pcol, ret, WRITE); - pcol_free(pcol); + if (!pcol_copy) /* Failed before ownership transfer */ + pcol_copy = pcol; + _unlock_pcol_pages(pcol_copy, ret, WRITE); + pcol_free(pcol_copy); kfree(pcol_copy); return ret; -- cgit v1.2.1 From b0ef9647a0cd6cfd63fed48fbbe6005e4ba92571 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 11 Dec 2012 12:10:14 -0500 Subject: NFSv4.1: Be conservative about the client highest slotid If the server sends us a target that looks like an outlier, but is lower than the existing target, then respect it anyway. However defer actually updating the generation counter until we get a target that doesn't look like an outlier. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4session.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c index ed5aa9fa9c7b..1e6c87c443a7 100644 --- a/fs/nfs/nfs4session.c +++ b/fs/nfs/nfs4session.c @@ -273,20 +273,28 @@ void nfs41_wake_slot_table(struct nfs4_slot_table *tbl) } } +static void nfs41_set_max_slotid_locked(struct nfs4_slot_table *tbl, + u32 target_highest_slotid) +{ + u32 max_slotid; + + max_slotid = min(NFS4_MAX_SLOT_TABLE - 1, target_highest_slotid); + if (max_slotid > tbl->server_highest_slotid) + max_slotid = tbl->server_highest_slotid; + if (max_slotid > tbl->target_highest_slotid) + max_slotid = tbl->target_highest_slotid; + tbl->max_slotid = max_slotid; + nfs41_wake_slot_table(tbl); +} + /* Update the client's idea of target_highest_slotid */ static void nfs41_set_target_slotid_locked(struct nfs4_slot_table *tbl, u32 target_highest_slotid) { - unsigned int max_slotid; - if (tbl->target_highest_slotid == target_highest_slotid) return; tbl->target_highest_slotid = target_highest_slotid; tbl->generation++; - - max_slotid = min(NFS4_MAX_SLOT_TABLE - 1, tbl->target_highest_slotid); - tbl->max_slotid = max_slotid; - nfs41_wake_slot_table(tbl); } void nfs41_set_target_slotid(struct nfs4_slot_table *tbl, @@ -296,6 +304,7 @@ void nfs41_set_target_slotid(struct nfs4_slot_table *tbl, nfs41_set_target_slotid_locked(tbl, target_highest_slotid); tbl->d_target_highest_slotid = 0; tbl->d2_target_highest_slotid = 0; + nfs41_set_max_slotid_locked(tbl, target_highest_slotid); spin_unlock(&tbl->slot_tbl_lock); } @@ -370,6 +379,7 @@ void nfs41_update_target_slotid(struct nfs4_slot_table *tbl, nfs41_set_target_slotid_locked(tbl, res->sr_target_highest_slotid); if (tbl->generation == slot->generation) nfs41_set_server_slotid_locked(tbl, res->sr_highest_slotid); + nfs41_set_max_slotid_locked(tbl, res->sr_target_highest_slotid); spin_unlock(&tbl->slot_tbl_lock); } -- cgit v1.2.1 From 193cdd8a293007d1a1ad252cf66b2dc5b793d2d0 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 10 Dec 2012 06:10:44 -0500 Subject: cifs: fix SID binary to string conversion The authority fields are supposed to be represented by a single 48-bit value. It's also supposed to represent the value as hex if it's equal to or greater than 2^32. This is documented in MS-DTYP, section 2.4.2.1. 
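As an illustration of that rule, here is a small self-contained sketch (plain userspace C, not the cifs code itself) that assembles the six big-endian authority bytes into a single 48-bit value and picks decimal or hex output accordingly:

#include <stdio.h>
#include <limits.h>

/* Assemble the 6 big-endian authority bytes into one 48-bit value and
 * print it per MS-DTYP: decimal below 2^32, hex at or above it. */
static void print_authority(const unsigned char auth[6])
{
	unsigned long long val = 0;
	int i;

	for (i = 0; i < 6; i++)
		val = (val << 8) | auth[i];

	if (val <= UINT_MAX)
		printf("-%llu", val);
	else
		printf("-0x%llx", val);
}

int main(void)
{
	unsigned char nt_auth[6]  = { 0, 0, 0, 0, 0, 5 };	/* prints -5 */
	unsigned char big_auth[6] = { 1, 0, 0, 0, 0, 0 };	/* >= 2^32: prints -0x10000000000 */

	print_authority(nt_auth);
	print_authority(big_auth);
	printf("\n");
	return 0;
}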
Also, fix up the max string length to account for this fix. Acked-by: Pavel Shilovsky Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 25 +++++++++++++++++++------ fs/cifs/cifsacl.h | 8 +++++--- 2 files changed, 24 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 8dd9212ffef5..75c1ee699143 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -94,6 +94,7 @@ sid_to_key_str(struct cifs_sid *sidptr, unsigned int type) int i, len; unsigned int saval; char *sidstr, *strptr; + unsigned long long id_auth_val; /* 3 bytes for prefix */ sidstr = kmalloc(3 + SID_STRING_BASE_SIZE + @@ -107,12 +108,24 @@ sid_to_key_str(struct cifs_sid *sidptr, unsigned int type) sidptr->revision); strptr += len; - for (i = 0; i < NUM_AUTHS; ++i) { - if (sidptr->authority[i]) { - len = sprintf(strptr, "-%hhu", sidptr->authority[i]); - strptr += len; - } - } + /* The authority field is a single 48-bit number */ + id_auth_val = (unsigned long long)sidptr->authority[5]; + id_auth_val |= (unsigned long long)sidptr->authority[4] << 8; + id_auth_val |= (unsigned long long)sidptr->authority[3] << 16; + id_auth_val |= (unsigned long long)sidptr->authority[2] << 24; + id_auth_val |= (unsigned long long)sidptr->authority[1] << 32; + id_auth_val |= (unsigned long long)sidptr->authority[0] << 48; + + /* + * MS-DTYP states that if the authority is >= 2^32, then it should be + * expressed as a hex value. + */ + if (id_auth_val <= UINT_MAX) + len = sprintf(strptr, "-%llu", id_auth_val); + else + len = sprintf(strptr, "-0x%llx", id_auth_val); + + strptr += len; for (i = 0; i < sidptr->num_subauth; ++i) { saval = le32_to_cpu(sidptr->sub_auth[i]); diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h index a445405f80d0..4f3884835267 100644 --- a/fs/cifs/cifsacl.h +++ b/fs/cifs/cifsacl.h @@ -55,12 +55,14 @@ * u8: max 3 bytes in decimal * u32: max 10 bytes in decimal * - * "S-" + 3 bytes for version field + 4 bytes for each authority field (3 bytes - * per number + 1 for '-') + NULL terminator. + * "S-" + 3 bytes for version field + 15 for authority field + NULL terminator + * + * For authority field, max is when all 6 values are non-zero and it must be + * represented in hex. So "-0x" + 12 hex digits. * * Add 11 bytes for each subauthority field (10 bytes each + 1 for '-') */ -#define SID_STRING_BASE_SIZE (2 + 3 + (4 * NUM_AUTHS) + 1) +#define SID_STRING_BASE_SIZE (2 + 3 + 15 + 1) #define SID_STRING_SUBAUTH_SIZE (11) /* size of a single subauth string */ struct cifs_ntsd { -- cgit v1.2.1 From 62a1a439e0fdd4ec8a80dc00fcbb9f26b5c34de1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 10 Dec 2012 06:10:45 -0500 Subject: cifs: clean up handling of unc= option Make sure we free any existing memory allocated for vol->UNC, just in case someone passes in multiple unc= options. Get rid of the check for too long a UNC. The check for >300 bytes seems arbitrary. We later copy this into the tcon->treeName, for instance and it's a lot shorter than 300 bytes. Eliminate an extra kmalloc and copy as well. Just set the vol->UNC directly with the contents of match_strdup. Establish that the UNC should be stored with '\\' delimiters. Use convert_delimiter to change it in place in the vol->UNC. Finally, move the check for a malformed UNC into cifs_parse_mount_options so we can catch that situation earlier. 
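The normalization this enables can be sketched in a few lines of standalone C; convert_delimiter() is re-implemented here purely for illustration, and the validation mirrors the checks the patch keeps:

#include <stdio.h>
#include <string.h>

/* Normalize a UNC in place: map '/' to '\', as cifs' convert_delimiter
 * does; this userspace copy is illustrative only. */
static void convert_delimiter(char *path, char delim)
{
	char other = (delim == '\\') ? '/' : '\\';
	char *pos;

	for (pos = path; *pos; pos++)
		if (*pos == other)
			*pos = delim;
}

static int validate_unc(char *unc)
{
	convert_delimiter(unc, '\\');
	if (unc[0] != '\\' || unc[1] != '\\')
		return -1;		/* must begin with // or \\ */
	if (!strchr(unc + 3, '\\'))
		return -1;		/* must carry a share name */
	return 0;
}

int main(void)
{
	char unc[] = "//server/share";

	if (validate_unc(unc) == 0)
		printf("ok: %s\n", unc);	/* prints \\server\share */
	return 0;
}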
Pavel Shilovsky Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 39 ++++++++++++--------------------------- 1 file changed, 12 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index f3276239e075..9c5c8b8c19fe 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1566,29 +1566,15 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, got_ip = true; break; case Opt_unc: - string = match_strdup(args); - if (string == NULL) + kfree(vol->UNC); + vol->UNC = match_strdup(args); + if (vol->UNC == NULL) goto out_nomem; - temp_len = strnlen(string, 300); - if (temp_len == 300) { - printk(KERN_WARNING "CIFS: UNC name too long\n"); - goto cifs_parse_mount_err; - } - - vol->UNC = kmalloc(temp_len+1, GFP_KERNEL); - if (vol->UNC == NULL) { - printk(KERN_WARNING "CIFS: no memory for UNC\n"); - goto cifs_parse_mount_err; - } - strcpy(vol->UNC, string); - - if (strncmp(string, "//", 2) == 0) { - vol->UNC[0] = '\\'; - vol->UNC[1] = '\\'; - } else if (strncmp(string, "\\\\", 2) != 0) { + convert_delimiter(vol->UNC, '\\'); + if (vol->UNC[0] != '\\' || vol->UNC[1] != '\\') { printk(KERN_WARNING "CIFS: UNC Path does not " - "begin with // or \\\\\n"); + "begin with // or \\\\\n"); goto cifs_parse_mount_err; } @@ -1813,6 +1799,12 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, goto cifs_parse_mount_err; } + /* make sure UNC has a share name */ + if (!strchr(vol->UNC + 3, '\\')) { + cERROR(1, "Malformed UNC. Unable to find share name."); + goto cifs_parse_mount_err; + } + if (!got_ip) { /* No ip= option specified? Try to get it from UNC */ if (!cifs_convert_address(dstaddr, &vol->UNC[2], @@ -2575,13 +2567,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) } } - if (strchr(volume_info->UNC + 3, '\\') == NULL - && strchr(volume_info->UNC + 3, '/') == NULL) { - cERROR(1, "Missing share name"); - rc = -ENODEV; - goto out_fail; - } - /* * BB Do we need to wrap session_mutex around this TCon call and Unix * SetFS as we do on SessSetup and reconnect? -- cgit v1.2.1 From 839db3d10a5ba792d6533b8bb3380f52ac877344 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 10 Dec 2012 06:10:45 -0500 Subject: cifs: fix up handling of prefixpath= option Currently the code takes care to ensure that the prefixpath has a leading '/' delimiter. What if someone passes us a prefixpath with a leading '\\' instead? The code doesn't properly handle that currently AFAICS. Let's just change the code to skip over any leading delimiter character when copying the prepath. Then, fix up the users of the prepath option to prefix it with the correct delimiter when they use it. Also, there's no need to limit the length of the prefixpath to 1k. If the server can handle it, why bother forbidding it? 
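A standalone sketch of the resulting convention, with the separator standing in for CIFS_DIR_SEP() and all names illustrative: the prefixpath is stored without its leading delimiter, and the separator is added back when a full path is composed.

#include <stdio.h>

/* Skip a single leading delimiter, as the new option parsing does */
static const char *strip_leading_delim(const char *prepath)
{
	if (*prepath == '/' || *prepath == '\\')
		prepath++;
	return prepath;
}

int main(void)
{
	const char *prepath = strip_leading_delim("\\dir\\sub");
	char full_path[256];
	char sep = '\\';	/* stand-in for CIFS_DIR_SEP(cifs_sb) */

	snprintf(full_path, sizeof(full_path), "\\\\server\\share%c%s",
		 sep, prepath);
	printf("%s\n", full_path);	/* \\server\share\dir\sub */
	return 0;
}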
Pavel Shilovsky Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 34 +++++++++------------------------- fs/cifs/dir.c | 5 +++-- 2 files changed, 12 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 9c5c8b8c19fe..94c4484c9ea3 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1612,31 +1612,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, } break; case Opt_prefixpath: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - - temp_len = strnlen(string, 1024); - if (string[0] != '/') - temp_len++; /* missing leading slash */ - if (temp_len > 1024) { - printk(KERN_WARNING "CIFS: prefix too long\n"); - goto cifs_parse_mount_err; - } - - vol->prepath = kmalloc(temp_len+1, GFP_KERNEL); - if (vol->prepath == NULL) { - printk(KERN_WARNING "CIFS: no memory " - "for path prefix\n"); - goto cifs_parse_mount_err; - } - - if (string[0] != '/') { - vol->prepath[0] = '/'; - strcpy(vol->prepath+1, string); - } else - strcpy(vol->prepath, string); + /* skip over any leading delimiter */ + if (*args[0].from == '/' || *args[0].from == '\\') + args[0].from++; + kfree(vol->prepath); + vol->prepath = match_strdup(args); + if (vol->prepath == NULL) + goto out_nomem; break; case Opt_iocharset: string = match_strdup(args); @@ -3236,7 +3219,7 @@ build_unc_path_to_root(const struct smb_vol *vol, const struct cifs_sb_info *cifs_sb) { char *full_path, *pos; - unsigned int pplen = vol->prepath ? strlen(vol->prepath) : 0; + unsigned int pplen = vol->prepath ? strlen(vol->prepath) + 1 : 0; unsigned int unc_len = strnlen(vol->UNC, MAX_TREE_SIZE + 1); full_path = kmalloc(unc_len + pplen + 1, GFP_KERNEL); @@ -3247,6 +3230,7 @@ build_unc_path_to_root(const struct smb_vol *vol, pos = full_path + unc_len; if (pplen) { + *pos++ = CIFS_DIR_SEP(cifs_sb); strncpy(pos, vol->prepath, pplen); pos += pplen; } diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 3b7e0c1266f7..8719bbe0dcc3 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -48,7 +48,7 @@ char * cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon) { - int pplen = vol->prepath ? strlen(vol->prepath) : 0; + int pplen = vol->prepath ? strlen(vol->prepath) + 1 : 0; int dfsplen; char *full_path = NULL; @@ -69,7 +69,8 @@ cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, if (dfsplen) strncpy(full_path, tcon->treeName, dfsplen); - strncpy(full_path + dfsplen, vol->prepath, pplen); + full_path[dfsplen] = CIFS_DIR_SEP(cifs_sb); + strncpy(full_path + dfsplen + 1, vol->prepath, pplen); convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb)); full_path[dfsplen + pplen] = 0; /* add trailing null */ return full_path; -- cgit v1.2.1 From d387a5c50bca619d56f276a69627c2e1c6e5c548 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 10 Dec 2012 06:10:46 -0500 Subject: cifs: parse the device name into UNC and prepath This should fix a regression that was introduced when the new mount option parser went in. Also, when the unc= and prefixpath= options are provided, check their values against the ones we parsed from the device string. If they differ, then throw a warning that tells the user that we're using the values from the unc= option for now, but that that will change in 3.10. 
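The parsing follows the shape of the cifs_parse_devname() helper added in the diff below; this standalone sketch (delimiter conversion omitted) shows the split of a device string into UNC and prepath:

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/* Split "//host/share/a/b" into UNC ("//host/share") and prepath ("a/b") */
static int parse_devname(const char *devname, char **unc, char **prepath)
{
	const char *delims = "/\\";
	const char *pos;
	size_t len;

	*unc = *prepath = NULL;

	/* require exactly two leading delimiters */
	if (strspn(devname, delims) != 2)
		return -1;

	/* find the delimiter between host and share name, then skip it */
	pos = strpbrk(devname + 2, delims);
	if (!pos)
		return -1;
	pos++;

	/* the share name runs to the next delimiter or end of string */
	len = strcspn(pos, delims);
	pos += len;

	*unc = strndup(devname, pos - devname);
	if (!*unc)
		return -1;

	/* anything past the share (minus its delimiter) is the prepath */
	if (*pos && *(pos + 1))
		*prepath = strdup(pos + 1);
	return 0;
}

int main(void)
{
	char *unc, *prepath;

	if (parse_devname("//srv/share/dir/file", &unc, &prepath) == 0)
		printf("UNC=%s prepath=%s\n", unc, prepath ? prepath : "(none)");
	free(unc);
	free(prepath);
	return 0;
}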
Pavel Shilovsky Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 88 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 94c4484c9ea3..7635b5db26a7 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1096,6 +1096,52 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol) return 0; } +/* + * Parse a devname into substrings and populate the vol->UNC and vol->prepath + * fields with the result. Returns 0 on success and an error otherwise. + */ +static int +cifs_parse_devname(const char *devname, struct smb_vol *vol) +{ + char *pos; + const char *delims = "/\\"; + size_t len; + + /* make sure we have a valid UNC double delimiter prefix */ + len = strspn(devname, delims); + if (len != 2) + return -EINVAL; + + /* find delimiter between host and sharename */ + pos = strpbrk(devname + 2, delims); + if (!pos) + return -EINVAL; + + /* skip past delimiter */ + ++pos; + + /* now go until next delimiter or end of string */ + len = strcspn(pos, delims); + + /* move "pos" up to delimiter or NULL */ + pos += len; + vol->UNC = kstrndup(devname, pos - devname, GFP_KERNEL); + if (!vol->UNC) + return -ENOMEM; + + convert_delimiter(vol->UNC, '\\'); + + /* If pos is NULL, or is a bogus trailing delimiter then no prepath */ + if (!*pos++ || !*pos) + return 0; + + vol->prepath = kstrdup(pos, GFP_KERNEL); + if (!vol->prepath) + return -ENOMEM; + + return 0; +} + static int cifs_parse_mount_options(const char *mountdata, const char *devname, struct smb_vol *vol) @@ -1181,6 +1227,16 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->backupuid_specified = false; /* no backup intent for a user */ vol->backupgid_specified = false; /* no backup intent for a group */ + /* + * For now, we ignore -EINVAL errors under the assumption that the + * unc= and prefixpath= options will be usable. + */ + if (cifs_parse_devname(devname, vol) == -ENOMEM) { + printk(KERN_ERR "CIFS: Unable to allocate memory to parse " + "device string.\n"); + goto out_nomem; + } + while ((data = strsep(&options, separator)) != NULL) { substring_t args[MAX_OPT_ARGS]; unsigned long option; @@ -1566,18 +1622,31 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, got_ip = true; break; case Opt_unc: - kfree(vol->UNC); + string = vol->UNC; vol->UNC = match_strdup(args); - if (vol->UNC == NULL) + if (vol->UNC == NULL) { + kfree(string); goto out_nomem; + } convert_delimiter(vol->UNC, '\\'); if (vol->UNC[0] != '\\' || vol->UNC[1] != '\\') { - printk(KERN_WARNING "CIFS: UNC Path does not " + kfree(string); + printk(KERN_ERR "CIFS: UNC Path does not " "begin with // or \\\\\n"); goto cifs_parse_mount_err; } + /* Compare old unc= option to new one */ + if (!string || strcmp(string, vol->UNC)) + printk(KERN_WARNING "CIFS: the value of the " + "unc= mount option does not match the " + "device string. Using the unc= option " + "for now. In 3.10, that option will " + "be ignored and the contents of the " + "device string will be used " + "instead. 
(%s != %s)\n", string, + vol->UNC); break; case Opt_domain: string = match_strdup(args); @@ -1616,10 +1685,22 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (*args[0].from == '/' || *args[0].from == '\\') args[0].from++; - kfree(vol->prepath); + string = vol->prepath; vol->prepath = match_strdup(args); - if (vol->prepath == NULL) + if (vol->prepath == NULL) { + kfree(string); goto out_nomem; + } + /* Compare old prefixpath= option to new one */ + if (!string || strcmp(string, vol->prepath)) + printk(KERN_WARNING "CIFS: the value of the " + "prefixpath= mount option does not " + "match the device string. Using the " + "prefixpath= option for now. In 3.10, " + "that option will be ignored and the " + "contents of the device string will be " + "used instead.(%s != %s)\n", string, + vol->prepath); break; case Opt_iocharset: string = match_strdup(args); @@ -1777,8 +1858,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, } #endif if (!vol->UNC) { - cERROR(1, "CIFS mount error: No UNC path (e.g. -o " - "unc=\\\\192.168.1.100\\public) specified"); + cERROR(1, "CIFS mount error: No usable UNC path provided in " + "device string or in unc= option!"); goto cifs_parse_mount_err; } -- cgit v1.2.1 From c299dd0e2d3dd61d0048a9d9b021aa01f023ed0c Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 6 Dec 2012 22:07:52 +0400 Subject: CIFS: Fix write after setting a read lock for read oplock files If we have a read oplock and set a read lock in it, we can't write to the locked area - so, filemap_fdatawrite may fail with a no information for a userspace application even if we request a write to non-locked area. Fix this by populating the page cache without marking affected pages dirty after a successful write directly to the server. Also remove CONFIG_CIFS_SMB2 ifdefs because it's suitable for both CIFS and SMB2 protocols. 
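A toy userspace model of the idea, with all structures hypothetical: the write goes to the server first, and the cached copy is then refreshed without being marked dirty, so no conflicting writeback is triggered later.

#include <stdio.h>
#include <string.h>
#include <stdbool.h>

/* Stand-ins for the server file and the local page cache */
struct page_cache {
	char data[64];
	bool dirty;
};

static void server_write(char *server_file, const char *buf)
{
	strcpy(server_file, buf);	/* models the direct server write */
}

static void cache_populate_clean(struct page_cache *pc, const char *buf)
{
	strcpy(pc->data, buf);		/* refresh the cached copy ... */
	pc->dirty = false;		/* ... but leave the pages clean */
}

int main(void)
{
	char server_file[64] = "";
	struct page_cache pc = { "", false };

	server_write(server_file, "hello");
	cache_populate_clean(&pc, server_file);
	printf("server=%s cache=%s dirty=%d\n", server_file, pc.data, pc.dirty);
	return 0;
}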
Signed-off-by: Pavel Shilovsky Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 1 + fs/cifs/cifsglob.h | 1 + fs/cifs/file.c | 94 ++++++++++++++++++++++++++++++++++++------------------ 3 files changed, 65 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index c6e32f22fbd3..210f0af83fc4 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -229,6 +229,7 @@ cifs_alloc_inode(struct super_block *sb) cifs_set_oplock_level(cifs_inode, 0); cifs_inode->delete_pending = false; cifs_inode->invalid_mapping = false; + cifs_inode->leave_pages_clean = false; cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ cifs_inode->server_eof = 0; cifs_inode->uniqueid = 0; diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index dfab450a191e..aea1eec64911 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1030,6 +1030,7 @@ struct cifsInodeInfo { bool clientCanCacheAll; /* read and writebehind oplock */ bool delete_pending; /* DELETE_ON_CLOSE is set */ bool invalid_mapping; /* pagecache is invalid */ + bool leave_pages_clean; /* protected by i_mutex, not set pages dirty */ unsigned long time; /* jiffies of last update of inode */ u64 server_eof; /* current file size on server -- protected by i_lock */ u64 uniqueid; /* server inode number */ diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 1b322d041f1e..0a6677ba212b 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2103,7 +2103,15 @@ static int cifs_write_end(struct file *file, struct address_space *mapping, } else { rc = copied; pos += copied; - set_page_dirty(page); + /* + * When we use strict cache mode and cifs_strict_writev was run + * with level II oplock (indicated by leave_pages_clean field of + * CIFS_I(inode)), we can leave pages clean - cifs_strict_writev + * sent the data to the server itself. + */ + if (!CIFS_I(inode)->leave_pages_clean || + !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)) + set_page_dirty(page); } if (rc > 0) { @@ -2454,8 +2462,8 @@ ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov, } static ssize_t -cifs_writev(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +cifs_pagecache_writev(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos, bool cache_ex) { struct file *file = iocb->ki_filp; struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; @@ -2477,8 +2485,12 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov, server->vals->exclusive_lock_type, NULL, CIFS_WRITE_OP)) { mutex_lock(&inode->i_mutex); + if (!cache_ex) + cinode->leave_pages_clean = true; rc = __generic_file_aio_write(iocb, iov, nr_segs, - &iocb->ki_pos); + &iocb->ki_pos); + if (!cache_ex) + cinode->leave_pages_clean = false; mutex_unlock(&inode->i_mutex); } @@ -2505,42 +2517,62 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, struct cifsFileInfo *cfile = (struct cifsFileInfo *) iocb->ki_filp->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); - -#ifdef CONFIG_CIFS_SMB2 + ssize_t written, written2; /* - * If we have an oplock for read and want to write a data to the file - * we need to store it in the page cache and then push it to the server - * to be sure the next read will get a valid data. + * We need to store clientCanCacheAll here to prevent race + * conditions - this value can be changed during an execution + * of generic_file_aio_write. 
For CIFS it can be changed from + * true to false only, but for SMB2 it can be changed both from + * true to false and vice versa. So, we can end up with a data + * stored in the cache, not marked dirty and not sent to the + * server if this value changes its state from false to true + * after cifs_write_end. */ - if (!cinode->clientCanCacheAll && cinode->clientCanCacheRead) { - ssize_t written; - int rc; - - written = generic_file_aio_write(iocb, iov, nr_segs, pos); - rc = filemap_fdatawrite(inode->i_mapping); - if (rc) - return (ssize_t)rc; + bool cache_ex = cinode->clientCanCacheAll; + bool cache_read = cinode->clientCanCacheRead; + int rc; + loff_t saved_pos; - return written; + if (cache_ex) { + if (cap_unix(tcon->ses) && + ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) && + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu( + tcon->fsUnixInfo.Capability))) + return generic_file_aio_write(iocb, iov, nr_segs, pos); + return cifs_pagecache_writev(iocb, iov, nr_segs, pos, cache_ex); } -#endif /* - * For non-oplocked files in strict cache mode we need to write the data - * to the server exactly from the pos to pos+len-1 rather than flush all - * affected pages because it may cause a error with mandatory locks on - * these pages but not on the region from pos to ppos+len-1. + * For files without exclusive oplock in strict cache mode we need to + * write the data to the server exactly from the pos to pos+len-1 rather + * than flush all affected pages because it may cause a error with + * mandatory locks on these pages but not on the region from pos to + * ppos+len-1. */ + written = cifs_user_writev(iocb, iov, nr_segs, pos); + if (!cache_read || written <= 0) + return written; - if (!cinode->clientCanCacheAll) - return cifs_user_writev(iocb, iov, nr_segs, pos); + saved_pos = iocb->ki_pos; + iocb->ki_pos = pos; + /* we have a read oplock - need to store a data in the page cache */ if (cap_unix(tcon->ses) && - (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && - ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) - return generic_file_aio_write(iocb, iov, nr_segs, pos); - - return cifs_writev(iocb, iov, nr_segs, pos); + ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) && + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu( + tcon->fsUnixInfo.Capability))) + written2 = generic_file_aio_write(iocb, iov, nr_segs, pos); + else + written2 = cifs_pagecache_writev(iocb, iov, nr_segs, pos, + cache_ex); + /* errors occured during writing - invalidate the page cache */ + if (written2 < 0) { + rc = cifs_invalidate_mapping(inode); + if (rc) + written = (ssize_t)rc; + else + iocb->ki_pos = saved_pos; + } + return written; } static struct cifs_readdata * -- cgit v1.2.1 From d8153d4d8b7b6141770e1416c4a338161205ed1b Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Tue, 14 Jun 2011 17:29:45 +0200 Subject: inotify, fanotify: replace fsnotify_put_group() with fsnotify_destroy_group() Currently in fsnotify_put_group() the ref count of a group is decremented, and if it becomes 0, fsnotify_destroy_group() is called. Since a group's ref count is only set to 1 at group creation and never increased after that, a call to fsnotify_put_group() always results in a call to fsnotify_destroy_group(). With this patch fsnotify_destroy_group() is called directly.
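The lifecycle this series moves toward can be modeled in a few lines of standalone C; the names are illustrative, not the fsnotify API: destroy becomes an explicit entry point that tears things down and drops the creator's reference, while put only drops references.

#include <stdio.h>
#include <stdlib.h>

struct group {
	int refcnt;
};

static struct group *group_alloc(void)
{
	struct group *g = malloc(sizeof(*g));

	if (!g)
		abort();
	g->refcnt = 1;		/* creator's reference */
	return g;
}

static void group_put(struct group *g)
{
	if (--g->refcnt == 0) {
		printf("freeing group\n");
		free(g);
	}
}

static void group_destroy(struct group *g)
{
	/* tear down marks/events here, then drop the creator's reference */
	group_put(g);
}

int main(void)
{
	struct group *g = group_alloc();

	group_destroy(g);	/* replaces the old bare group_put() call */
	return 0;
}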
Signed-off-by: Lino Sanfilippo Signed-off-by: Eric Paris --- fs/notify/fanotify/fanotify_user.c | 14 +++++++------- fs/notify/group.c | 2 +- fs/notify/inotify/inotify_user.c | 8 +++----- 3 files changed, 11 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index d43803669739..82ae6d783c14 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -415,7 +415,7 @@ static int fanotify_release(struct inode *ignored, struct file *file) wake_up(&group->fanotify_data.access_waitq); #endif /* matches the fanotify_init->fsnotify_alloc_group */ - fsnotify_put_group(group); + fsnotify_destroy_group(group); return 0; } @@ -728,13 +728,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) break; default: fd = -EINVAL; - goto out_put_group; + goto out_destroy_group; } if (flags & FAN_UNLIMITED_QUEUE) { fd = -EPERM; if (!capable(CAP_SYS_ADMIN)) - goto out_put_group; + goto out_destroy_group; group->max_events = UINT_MAX; } else { group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS; @@ -743,7 +743,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) if (flags & FAN_UNLIMITED_MARKS) { fd = -EPERM; if (!capable(CAP_SYS_ADMIN)) - goto out_put_group; + goto out_destroy_group; group->fanotify_data.max_marks = UINT_MAX; } else { group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS; @@ -751,12 +751,12 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags); if (fd < 0) - goto out_put_group; + goto out_destroy_group; return fd; -out_put_group: - fsnotify_put_group(group); +out_destroy_group: + fsnotify_destroy_group(group); return fd; } diff --git a/fs/notify/group.c b/fs/notify/group.c index 63fc294a4692..cfda328c3d11 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -50,7 +50,7 @@ void fsnotify_final_destroy_group(struct fsnotify_group *group) * situtation, the fsnotify_final_destroy_group will get called when that final * mark is freed. 
*/ -static void fsnotify_destroy_group(struct fsnotify_group *group) +void fsnotify_destroy_group(struct fsnotify_group *group) { /* clear all inode marks for this group */ fsnotify_clear_marks_by_group(group); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 8445fbc8985c..dbafbfc8ceca 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -293,10 +293,8 @@ static int inotify_release(struct inode *ignored, struct file *file) pr_debug("%s: group=%p\n", __func__, group); - fsnotify_clear_marks_by_group(group); - /* free this group, matching get was inotify_init->fsnotify_obtain_group */ - fsnotify_put_group(group); + fsnotify_destroy_group(group); return 0; } @@ -712,7 +710,7 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events) if (atomic_inc_return(&group->inotify_data.user->inotify_devs) > inotify_max_user_instances) { - fsnotify_put_group(group); + fsnotify_destroy_group(group); return ERR_PTR(-EMFILE); } @@ -741,7 +739,7 @@ SYSCALL_DEFINE1(inotify_init1, int, flags) ret = anon_inode_getfd("inotify", &inotify_fops, group, O_RDONLY | flags); if (ret < 0) - fsnotify_put_group(group); + fsnotify_destroy_group(group); return ret; } -- cgit v1.2.1 From 986129520479d689962a42c31acdeaf854ac91f5 Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Tue, 14 Jun 2011 17:29:46 +0200 Subject: fsnotify: introduce fsnotify_get_group() Introduce fsnotify_get_group(), which increments the reference counter of a group. Signed-off-by: Lino Sanfilippo Signed-off-by: Eric Paris --- fs/notify/group.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/notify/group.c b/fs/notify/group.c index cfda328c3d11..1d57c35f1043 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -62,6 +62,14 @@ void fsnotify_destroy_group(struct fsnotify_group *group) fsnotify_final_destroy_group(group); } +/* + * Get reference to a group. + */ +void fsnotify_get_group(struct fsnotify_group *group) +{ + atomic_inc(&group->refcnt); +} + /* * Drop a reference to a group. Free it if it's through. */ -- cgit v1.2.1 From 23e964c284ca0a767b80a30482bd53b059d30391 Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Tue, 14 Jun 2011 17:29:47 +0200 Subject: fsnotify: use reference counting for groups Get a group ref for each mark that is added to the group's list and release that ref when the mark is freed in fsnotify_put_mark(). We also take a group reference for duplicated marks and for private event data. Now we don't free a group when the number of marks becomes 0, but when the group's ref count does. Since this will only happen when all marks are removed from a group's mark list, we don't have to set the group's number of marks to 1 at group creation. Besides clearing all marks in fsnotify_destroy_group(), we also flush the group's event queue. This is because events may hold references to groups (due to private event data) and we have to put those references first before we get a chance to put the final ref, which will result in a call to fsnotify_final_destroy_group().
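A minimal model of the new ownership rule, with illustrative names: each mark takes its own group reference, so the group outlives the creator's reference until the last mark is detached.

#include <stdio.h>
#include <stdlib.h>

struct group { int refcnt; int num_marks; };
struct mark  { struct group *group; };

static void group_get(struct group *g) { g->refcnt++; }

static void group_put(struct group *g)
{
	if (--g->refcnt == 0) {
		printf("group freed\n");
		free(g);
	}
}

static void mark_attach(struct mark *m, struct group *g)
{
	group_get(g);		/* the mark pins the group ... */
	m->group = g;
	g->num_marks++;
}

static void mark_detach(struct mark *m)
{
	struct group *g = m->group;

	g->num_marks--;
	m->group = NULL;
	group_put(g);		/* ... until the mark is torn down */
}

int main(void)
{
	struct group *g = calloc(1, sizeof(*g));
	struct mark m;

	if (!g)
		return 1;
	g->refcnt = 1;		/* creator's reference */
	mark_attach(&m, g);
	group_put(g);		/* creator drops its ref; group survives */
	mark_detach(&m);	/* last mark gone -> group freed */
	return 0;
}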
Signed-off-by: Lino Sanfilippo Signed-off-by: Eric Paris --- fs/notify/group.c | 28 ++++++++++------------------ fs/notify/inotify/inotify_fsnotify.c | 2 ++ fs/notify/inotify/inotify_user.c | 1 + fs/notify/mark.c | 24 ++++++++++++++---------- 4 files changed, 27 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/notify/group.c b/fs/notify/group.c index 1d57c35f1043..354044c47e23 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -33,9 +33,6 @@ */ void fsnotify_final_destroy_group(struct fsnotify_group *group) { - /* clear the notification queue of all events */ - fsnotify_flush_notify(group); - if (group->ops->free_group_priv) group->ops->free_group_priv(group); @@ -43,12 +40,10 @@ void fsnotify_final_destroy_group(struct fsnotify_group *group) } /* - * Trying to get rid of a group. We need to first get rid of any outstanding - * allocations and then free the group. Remember that fsnotify_clear_marks_by_group - * could miss marks that are being freed by inode and those marks could still - * hold a reference to this group (via group->num_marks) If we get into that - * situtation, the fsnotify_final_destroy_group will get called when that final - * mark is freed. + * Trying to get rid of a group. Remove all marks, flush all events and release + * the group reference. + * Note that another thread calling fsnotify_clear_marks_by_group() may still + * hold a ref to the group. */ void fsnotify_destroy_group(struct fsnotify_group *group) { @@ -57,9 +52,10 @@ void fsnotify_destroy_group(struct fsnotify_group *group) synchronize_srcu(&fsnotify_mark_srcu); - /* past the point of no return, matches the initial value of 1 */ - if (atomic_dec_and_test(&group->num_marks)) - fsnotify_final_destroy_group(group); + /* clear the notification queue of all events */ + fsnotify_flush_notify(group); + + fsnotify_put_group(group); } /* @@ -76,7 +72,7 @@ void fsnotify_get_group(struct fsnotify_group *group) void fsnotify_put_group(struct fsnotify_group *group) { if (atomic_dec_and_test(&group->refcnt)) - fsnotify_destroy_group(group); + fsnotify_final_destroy_group(group); } /* @@ -92,11 +88,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) /* set to 0 when there a no external references to this group */ atomic_set(&group->refcnt, 1); - /* - * hits 0 when there are no external references AND no marks for - * this group - */ - atomic_set(&group->num_marks, 1); + atomic_set(&group->num_marks, 0); mutex_init(&group->notification_mutex); INIT_LIST_HEAD(&group->notification_list); diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index e3cbd746f64a..74977fbf5aae 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -118,6 +118,7 @@ static int inotify_handle_event(struct fsnotify_group *group, fsn_event_priv = &event_priv->fsnotify_event_priv_data; + fsnotify_get_group(group); fsn_event_priv->group = group; event_priv->wd = wd; @@ -210,6 +211,7 @@ void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv) event_priv = container_of(fsn_event_priv, struct inotify_event_private_data, fsnotify_event_priv_data); + fsnotify_put_group(fsn_event_priv->group); kmem_cache_free(event_priv_cachep, event_priv); } diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index dbafbfc8ceca..246250f1db7a 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -531,6 +531,7 @@ void inotify_ignored_and_remove_idr(struct 
fsnotify_mark *fsn_mark, fsn_event_priv = &event_priv->fsnotify_event_priv_data; + fsnotify_get_group(group); fsn_event_priv->group = group; event_priv->wd = i_mark->wd; diff --git a/fs/notify/mark.c b/fs/notify/mark.c index f104d565b682..3c7a1699df3d 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -109,8 +109,11 @@ void fsnotify_get_mark(struct fsnotify_mark *mark) void fsnotify_put_mark(struct fsnotify_mark *mark) { - if (atomic_dec_and_test(&mark->refcnt)) + if (atomic_dec_and_test(&mark->refcnt)) { + if (mark->group) + fsnotify_put_group(mark->group); mark->free_mark(mark); + } } /* @@ -125,12 +128,13 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark) spin_lock(&mark->lock); + fsnotify_get_group(mark->group); group = mark->group; /* something else already called this function on this mark */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { spin_unlock(&mark->lock); - return; + goto put_group; } mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; @@ -177,19 +181,15 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark) if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) iput(inode); - /* * We don't necessarily have a ref on mark from caller so the above iput * may have already destroyed it. Don't touch from now on. */ - /* - * it's possible that this group tried to destroy itself, but this - * this mark was simultaneously being freed by inode. If that's the - * case, we finish freeing the group here. - */ - if (unlikely(atomic_dec_and_test(&group->num_marks))) - fsnotify_final_destroy_group(group); + atomic_dec(&group->num_marks); + +put_group: + fsnotify_put_group(group); } void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) @@ -234,6 +234,7 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE; + fsnotify_get_group(group); mark->group = group; list_add(&mark->g_list, &group->marks_list); atomic_inc(&group->num_marks); @@ -265,6 +266,7 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, err: mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; list_del_init(&mark->g_list); + fsnotify_put_group(group); mark->group = NULL; atomic_dec(&group->num_marks); @@ -317,6 +319,8 @@ void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *ol assert_spin_locked(&old->lock); new->i.inode = old->i.inode; new->m.mnt = old->m.mnt; + if (old->group) + fsnotify_get_group(old->group); new->group = old->group; new->mask = old->mask; new->free_mark = old->free_mark; -- cgit v1.2.1 From 104d06f08ea59247cb0e7e548c5a5d22d21dcfd5 Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Tue, 14 Jun 2011 17:29:48 +0200 Subject: fsnotify: take group's mark_lock before mark lock Race-free addition and removal of a mark to/from a group's mark list would be easier if we could lock the mark list of a group before we lock the specific mark. This patch changes the order used to add/remove marks to/from mark lists from 1. mark->lock 2. group->mark_lock 3. inode->i_lock to 1. group->mark_lock 2. mark->lock 3.
inode->i_lock Signed-off-by: Lino Sanfilippo Signed-off-by: Eric Paris --- fs/notify/mark.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 3c7a1699df3d..32447dc06c07 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -127,20 +127,27 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark) struct inode *inode = NULL; spin_lock(&mark->lock); - + /* dont get the group from a mark that is not alive yet */ + if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { + spin_unlock(&mark->lock); + return; + } fsnotify_get_group(mark->group); group = mark->group; + spin_unlock(&mark->lock); + + spin_lock(&group->mark_lock); + spin_lock(&mark->lock); /* something else already called this function on this mark */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { spin_unlock(&mark->lock); + spin_unlock(&group->mark_lock); goto put_group; } mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; - spin_lock(&group->mark_lock); - if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { inode = mark->i.inode; fsnotify_destroy_inode_mark(mark); @@ -151,8 +158,8 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark) list_del_init(&mark->g_list); - spin_unlock(&group->mark_lock); spin_unlock(&mark->lock); + spin_unlock(&group->mark_lock); spin_lock(&destroy_lock); list_add(&mark->destroy_list, &destroy_list); @@ -225,13 +232,13 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, /* * LOCKING ORDER!!!! - * mark->lock * group->mark_lock + * mark->lock * inode->i_lock */ - spin_lock(&mark->lock); spin_lock(&group->mark_lock); + spin_lock(&group->mark_lock); spin_lock(&mark->lock); ... -- cgit v1.2.1 From 6dfbd149946c22c2e2886d6b560def78630c8387 Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Tue, 14 Jun 2011 17:29:49 +0200 Subject: fanotify: add an extra flag to mark_remove_from_mask that indicates whether a mark should be destroyed This patch adds an extra flag to mark_remove_from_mask() to inform the caller if the mark should be destroyed. With this we don't destroy the mark implicitly in the function itself anymore, but let the caller handle it.
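The pattern reduces to a helper that reports emptiness through an out parameter while the caller decides whether to destroy; a simplified standalone sketch (the FAN_MARK_IGNORED_MASK handling is omitted, and names are illustrative):

#include <stdio.h>

/* Remove bits from a mark's mask; report via *destroy whether nothing
 * is left, so the caller can tear the mark down itself. */
static unsigned int remove_from_mask(unsigned int *mark_mask,
				     unsigned int mask, int *destroy)
{
	unsigned int oldmask = *mark_mask;

	*mark_mask = oldmask & ~mask;
	*destroy = !(oldmask & ~mask);	/* nothing left -> caller destroys */
	return mask & oldmask;		/* bits actually removed */
}

int main(void)
{
	unsigned int mark_mask = 0x5;
	unsigned int removed;
	int destroy;

	removed = remove_from_mask(&mark_mask, 0x1, &destroy);
	printf("removed=%#x destroy=%d\n", removed, destroy);	/* 0x1, 0 */
	removed = remove_from_mask(&mark_mask, 0x4, &destroy);
	printf("removed=%#x destroy=%d\n", removed, destroy);	/* 0x4, 1 */
	return 0;
}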
Signed-off-by: Lino Sanfilippo Signed-off-by: Eric Paris --- fs/notify/fanotify/fanotify_user.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 82ae6d783c14..599a01952c74 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -511,7 +511,8 @@ out: static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, __u32 mask, - unsigned int flags) + unsigned int flags, + int *destroy) { __u32 oldmask; @@ -525,8 +526,7 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, } spin_unlock(&fsn_mark->lock); - if (!(oldmask & ~mask)) - fsnotify_destroy_mark(fsn_mark); + *destroy = !(oldmask & ~mask); return mask & oldmask; } @@ -537,12 +537,17 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, { struct fsnotify_mark *fsn_mark = NULL; __u32 removed; + int destroy_mark; fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); if (!fsn_mark) return -ENOENT; - removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags); + removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, + &destroy_mark); + if (destroy_mark) + fsnotify_destroy_mark(fsn_mark); + fsnotify_put_mark(fsn_mark); if (removed & real_mount(mnt)->mnt_fsnotify_mask) fsnotify_recalc_vfsmount_mask(mnt); @@ -556,12 +561,16 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group, { struct fsnotify_mark *fsn_mark = NULL; __u32 removed; + int destroy_mark; fsn_mark = fsnotify_find_inode_mark(group, inode); if (!fsn_mark) return -ENOENT; - removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags); + removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, + &destroy_mark); + if (destroy_mark) + fsnotify_destroy_mark(fsn_mark); /* matches the fsnotify_find_inode_mark() */ fsnotify_put_mark(fsn_mark); if (removed & inode->i_fsnotify_mask) -- cgit v1.2.1 From 986ab09807ca9454c3f54aae4db7e1bb00daeed3 Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Tue, 14 Jun 2011 17:29:50 +0200 Subject: fsnotify: use a mutex instead of a spinlock to protect a group's mark list Replaces the group's mark_lock spinlock with a mutex. Using a mutex instead of a spinlock results in more flexibility (i.e. it allows sleeping while the lock is held).
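What the conversion buys can be seen in a hypothetical kernel-style sketch (structure and function names are illustrative, not the fsnotify code): with a mutex, the holder of the mark-list lock may block, e.g. in a GFP_KERNEL allocation, which would be illegal under a spinlock.

#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/errno.h>

struct demo_mark {
	struct list_head g_list;
};

struct demo_group {
	struct mutex mark_mutex;
	struct list_head marks_list;
};

static void demo_group_init(struct demo_group *group)
{
	mutex_init(&group->mark_mutex);
	INIT_LIST_HEAD(&group->marks_list);
}

static int demo_add_mark(struct demo_group *group)
{
	struct demo_mark *mark;

	mutex_lock(&group->mark_mutex);
	mark = kzalloc(sizeof(*mark), GFP_KERNEL);	/* may sleep: allowed */
	if (!mark) {
		mutex_unlock(&group->mark_mutex);
		return -ENOMEM;
	}
	list_add(&mark->g_list, &group->marks_list);
	mutex_unlock(&group->mark_mutex);
	return 0;
}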
Signed-off-by: Lino Sanfilippo Signed-off-by: Eric Paris --- fs/notify/group.c | 2 +- fs/notify/inode_mark.c | 4 ++-- fs/notify/mark.c | 18 +++++++++--------- fs/notify/vfsmount_mark.c | 4 ++-- 4 files changed, 14 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/notify/group.c b/fs/notify/group.c index 354044c47e23..1f7305711fc9 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -95,7 +95,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) init_waitqueue_head(&group->notification_waitq); group->max_events = UINT_MAX; - spin_lock_init(&group->mark_lock); + mutex_init(&group->mark_mutex); INIT_LIST_HEAD(&group->marks_list); group->ops = ops; diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index b13c00ac48eb..4e9071e37d5d 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -63,8 +63,8 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) { struct inode *inode = mark->i.inode; + BUG_ON(!mutex_is_locked(&mark->group->mark_mutex)); assert_spin_locked(&mark->lock); - assert_spin_locked(&mark->group->mark_lock); spin_lock(&inode->i_lock); @@ -191,8 +191,8 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, mark->flags |= FSNOTIFY_MARK_FLAG_INODE; + BUG_ON(!mutex_is_locked(&group->mark_mutex)); assert_spin_locked(&mark->lock); - assert_spin_locked(&group->mark_lock); spin_lock(&inode->i_lock); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 32447dc06c07..ab25b810b146 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -136,13 +136,13 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark) group = mark->group; spin_unlock(&mark->lock); - spin_lock(&group->mark_lock); + mutex_lock(&group->mark_mutex); spin_lock(&mark->lock); /* something else already called this function on this mark */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { spin_unlock(&mark->lock); - spin_unlock(&group->mark_lock); + mutex_unlock(&group->mark_mutex); goto put_group; } @@ -159,7 +159,7 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark) list_del_init(&mark->g_list); spin_unlock(&mark->lock); - spin_unlock(&group->mark_lock); + mutex_unlock(&group->mark_mutex); spin_lock(&destroy_lock); list_add(&mark->destroy_list, &destroy_list); @@ -232,11 +232,11 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, /* * LOCKING ORDER!!!! 
- * group->mark_lock + * group->mark_mutex * mark->lock * inode->i_lock */ - spin_lock(&group->mark_lock); + mutex_lock(&group->mark_mutex); spin_lock(&mark->lock); mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE; @@ -263,7 +263,7 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_set_mark_mask_locked(mark, mark->mask); spin_unlock(&mark->lock); - spin_unlock(&group->mark_lock); + mutex_unlock(&group->mark_mutex); if (inode) __fsnotify_update_child_dentry_flags(inode); @@ -277,7 +277,7 @@ err: atomic_dec(&group->num_marks); spin_unlock(&mark->lock); - spin_unlock(&group->mark_lock); + mutex_unlock(&group->mark_mutex); spin_lock(&destroy_lock); list_add(&mark->destroy_list, &destroy_list); @@ -296,7 +296,7 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, struct fsnotify_mark *lmark, *mark; LIST_HEAD(free_list); - spin_lock(&group->mark_lock); + mutex_lock(&group->mark_mutex); list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { if (mark->flags & flags) { list_add(&mark->free_g_list, &free_list); @@ -304,7 +304,7 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, fsnotify_get_mark(mark); } } - spin_unlock(&group->mark_lock); + mutex_unlock(&group->mark_mutex); list_for_each_entry_safe(mark, lmark, &free_list, free_g_list) { fsnotify_destroy_mark(mark); diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index b7b4b0e8554f..f26a348827f8 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -88,8 +88,8 @@ void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark) { struct vfsmount *mnt = mark->m.mnt; + BUG_ON(!mutex_is_locked(&mark->group->mark_mutex)); assert_spin_locked(&mark->lock); - assert_spin_locked(&mark->group->mark_lock); spin_lock(&mnt->mnt_root->d_lock); @@ -151,8 +151,8 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT; + BUG_ON(!mutex_is_locked(&group->mark_mutex)); assert_spin_locked(&mark->lock); - assert_spin_locked(&group->mark_lock); spin_lock(&mnt->mnt_root->d_lock); -- cgit v1.2.1 From 3fed40cc97f32bebfd34a55364de9b44dcbede59 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 13 Sep 2012 04:51:36 -0600 Subject: Btrfs: cleanup duplicated division functions div_factor{_fine} has been implemented twice; clean it up by moving both copies into an independent file named math.h, since they are common math functions.
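As a worked example of what the two helpers compute (a self-contained userspace approximation; the in-kernel versions in this patch use do_div() instead because plain 64-bit division is unavailable on some 32-bit targets):

	#include <stdio.h>
	#include <stdint.h>

	/* num * factor / 10, i.e. factor is in tenths (80% -> factor 8) */
	static uint64_t div_factor(uint64_t num, int factor)
	{
		if (factor == 10)
			return num;
		return num * factor / 10;
	}

	/* num * factor / 100, i.e. factor is in percent (75% -> factor 75) */
	static uint64_t div_factor_fine(uint64_t num, int factor)
	{
		if (factor == 100)
			return num;
		return num * factor / 100;
	}

	int main(void)
	{
		printf("%llu %llu\n",
		       (unsigned long long)div_factor(1000, 8),
		       (unsigned long long)div_factor_fine(1000, 75));
		return 0;	/* prints: 800 750 */
	}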
Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 19 +------------------ fs/btrfs/math.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.c | 23 +---------------------- 3 files changed, 46 insertions(+), 40 deletions(-) create mode 100644 fs/btrfs/math.h (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3d3e2c17d8d1..7563db782abf 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -33,6 +33,7 @@ #include "volumes.h" #include "locking.h" #include "free-space-cache.h" +#include "math.h" #undef SCRAMBLE_DELAYED_REFS @@ -649,24 +650,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) rcu_read_unlock(); } -static u64 div_factor(u64 num, int factor) -{ - if (factor == 10) - return num; - num *= factor; - do_div(num, 10); - return num; -} - -static u64 div_factor_fine(u64 num, int factor) -{ - if (factor == 100) - return num; - num *= factor; - do_div(num, 100); - return num; -} - u64 btrfs_find_block_group(struct btrfs_root *root, u64 search_start, u64 search_hint, int owner) { diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h new file mode 100644 index 000000000000..b7816cefbd13 --- /dev/null +++ b/fs/btrfs/math.h @@ -0,0 +1,44 @@ + +/* + * Copyright (C) 2012 Fujitsu. All rights reserved. + * Written by Miao Xie + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __BTRFS_MATH_H +#define __BTRFS_MATH_H + +#include + +static inline u64 div_factor(u64 num, int factor) +{ + if (factor == 10) + return num; + num *= factor; + do_div(num, 10); + return num; +} + +static inline u64 div_factor_fine(u64 num, int factor) +{ + if (factor == 100) + return num; + num *= factor; + do_div(num, 100); + return num; +} + +#endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0f5ebb72a5ea..a8adf2686473 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -25,7 +25,6 @@ #include #include #include -#include #include "compat.h" #include "ctree.h" #include "extent_map.h" @@ -36,6 +35,7 @@ #include "async-thread.h" #include "check-integrity.h" #include "rcu-string.h" +#include "math.h" static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -2338,18 +2338,6 @@ static int chunk_profiles_filter(u64 chunk_type, return 1; } -static u64 div_factor_fine(u64 num, int factor) -{ - if (factor <= 0) - return 0; - if (factor >= 100) - return num; - - num *= factor; - do_div(num, 100); - return num; -} - static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, struct btrfs_balance_args *bargs) { @@ -2514,15 +2502,6 @@ static int should_balance_chunk(struct btrfs_root *root, return 1; } -static u64 div_factor(u64 num, int factor) -{ - if (factor == 10) - return num; - num *= factor; - do_div(num, 10); - return num; -} - static int __btrfs_balance(struct btrfs_fs_info *fs_info) { struct btrfs_balance_control *bctl = fs_info->balance_ctl; -- cgit v1.2.1 From 561c294d4cfb30c4acfa0a243448fc55af730d87 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 16 Oct 2012 11:32:18 +0000 Subject: Btrfs: fix wrong comment in can_overcommit() The comment does not match the code. Fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7563db782abf..2cfcce290aba 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3668,9 +3668,9 @@ static int can_overcommit(struct btrfs_root *root, avail >>= 1; /* - * If we aren't flushing don't let us overcommit too much, say - * 1/8th of the space. If we can flush, let it overcommit up to - * 1/2 of the space. + * If we aren't flushing all things, let us overcommit up to + * 1/2th of the space. If we can flush, don't let us overcommit + * too much, let it overcommit up to 1/8 of the space. */ if (flush) avail >>= 3; -- cgit v1.2.1 From 08e007d2e57744472a9424735a368ffe6d625597 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 16 Oct 2012 11:33:38 +0000 Subject: Btrfs: improve the noflush reservation In some places (such as when evicting an inode) we just cannot flush the reserved delalloc space, while flushing the delayed directory index and the delayed inode is OK; but we don't try to flush those things and simply give up when there is not enough space to be reserved. This patch fixes this problem. We define three types of flush operations: NO_FLUSH, FLUSH_LIMIT and FLUSH_ALL. If we are in a transaction, we must not flush anything, or a deadlock would happen, so use NO_FLUSH. If flushing the reserved delalloc space would cause a deadlock, use FLUSH_LIMIT. In all other cases FLUSH_ALL is used, and we flush everything.
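A hedged sketch of the rule of thumb the three levels encode (the helper name below is invented for illustration; the real conversions of each call site follow in the diff):

	/* Illustrative only: choosing a flush level for a reservation. */
	static enum btrfs_reserve_flush_enum
	pick_flush_level(bool in_transaction, bool delalloc_may_deadlock)
	{
		if (in_transaction)
			return BTRFS_RESERVE_NO_FLUSH;	  /* flushing could deadlock */
		if (delalloc_may_deadlock)
			return BTRFS_RESERVE_FLUSH_LIMIT; /* flush all but delalloc */
		return BTRFS_RESERVE_FLUSH_ALL;		  /* free to flush everything */
	}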
Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 26 ++++++++----- fs/btrfs/delayed-inode.c | 6 ++- fs/btrfs/extent-tree.c | 97 ++++++++++++++++++++++++------------------------ fs/btrfs/inode-map.c | 5 ++- fs/btrfs/inode.c | 5 ++- fs/btrfs/relocation.c | 12 ++++-- fs/btrfs/transaction.c | 30 +++++++-------- fs/btrfs/transaction.h | 2 +- 8 files changed, 97 insertions(+), 86 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c72ead869507..8fd9fe4282f5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2900,6 +2900,18 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); + +enum btrfs_reserve_flush_enum { + /* If we are in the transaction, we can't flush anything.*/ + BTRFS_RESERVE_NO_FLUSH, + /* + * Flushing delalloc may cause deadlock somewhere, in this + * case, use FLUSH LIMIT + */ + BTRFS_RESERVE_FLUSH_LIMIT, + BTRFS_RESERVE_FLUSH_ALL, +}; + int btrfs_check_data_free_space(struct inode *inode, u64 bytes); void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, @@ -2919,19 +2931,13 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv); int btrfs_block_rsv_add(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes); -int btrfs_block_rsv_add_noflush(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes); + struct btrfs_block_rsv *block_rsv, u64 num_bytes, + enum btrfs_reserve_flush_enum flush); int btrfs_block_rsv_check(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, int min_factor); int btrfs_block_rsv_refill(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved); -int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved); + struct btrfs_block_rsv *block_rsv, u64 min_reserved, + enum btrfs_reserve_flush_enum flush); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 478f66bdc57b..0c6dca550ea1 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -651,7 +651,8 @@ static int btrfs_delayed_inode_reserve_metadata( */ if (!src_rsv || (!trans->bytes_reserved && src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { - ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); + ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, + BTRFS_RESERVE_NO_FLUSH); /* * Since we're under a transaction reserve_metadata_bytes could * try to commit the transaction which will make it return @@ -686,7 +687,8 @@ static int btrfs_delayed_inode_reserve_metadata( * reserve something strictly for us. If not be a pain and try * to steal from the delalloc block rsv. 
*/ - ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); + ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, + BTRFS_RESERVE_NO_FLUSH); if (!ret) goto out; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2cfcce290aba..2136adda2a0f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3644,7 +3644,7 @@ out: static int can_overcommit(struct btrfs_root *root, struct btrfs_space_info *space_info, u64 bytes, - int flush) + enum btrfs_reserve_flush_enum flush) { u64 profile = btrfs_get_alloc_profile(root, 0); u64 avail; @@ -3672,7 +3672,7 @@ static int can_overcommit(struct btrfs_root *root, * 1/2th of the space. If we can flush, don't let us overcommit * too much, let it overcommit up to 1/8 of the space. */ - if (flush) + if (flush == BTRFS_RESERVE_FLUSH_ALL) avail >>= 3; else avail >>= 1; @@ -3696,6 +3696,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, long time_left; unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; int loops = 0; + enum btrfs_reserve_flush_enum flush; trans = (struct btrfs_trans_handle *)current->journal_info; block_rsv = &root->fs_info->delalloc_block_rsv; @@ -3723,8 +3724,12 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, wait_event(root->fs_info->async_submit_wait, !atomic_read(&root->fs_info->async_delalloc_pages)); + if (!trans) + flush = BTRFS_RESERVE_FLUSH_ALL; + else + flush = BTRFS_RESERVE_NO_FLUSH; spin_lock(&space_info->lock); - if (can_overcommit(root, space_info, orig, !trans)) { + if (can_overcommit(root, space_info, orig, flush)) { spin_unlock(&space_info->lock); break; } @@ -3882,7 +3887,8 @@ static int flush_space(struct btrfs_root *root, */ static int reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 orig_bytes, int flush) + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) { struct btrfs_space_info *space_info = block_rsv->space_info; u64 used; @@ -3895,10 +3901,11 @@ again: ret = 0; spin_lock(&space_info->lock); /* - * We only want to wait if somebody other than us is flushing and we are - * actually alloed to flush. + * We only want to wait if somebody other than us is flushing and we + * are actually allowed to flush all things. */ - while (flush && !flushing && space_info->flush) { + while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing && + space_info->flush) { spin_unlock(&space_info->lock); /* * If we have a trans handle we can't wait because the flusher @@ -3964,23 +3971,40 @@ again: * Couldn't make our reservation, save our place so while we're trying * to reclaim space we can actually use it instead of somebody else * stealing it from us. + * + * We make the other tasks wait for the flush only when we can flush + * all things. */ - if (ret && flush) { + if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) { flushing = true; space_info->flush = 1; } spin_unlock(&space_info->lock); - if (!ret || !flush) + if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) goto out; ret = flush_space(root, space_info, num_bytes, orig_bytes, flush_state); flush_state++; + + /* + * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock + * would happen. So skip delalloc flush. 
+ */ + if (flush == BTRFS_RESERVE_FLUSH_LIMIT && + (flush_state == FLUSH_DELALLOC || + flush_state == FLUSH_DELALLOC_WAIT)) + flush_state = ALLOC_CHUNK; + if (!ret) goto again; - else if (flush_state <= COMMIT_TRANS) + else if (flush == BTRFS_RESERVE_FLUSH_LIMIT && + flush_state < COMMIT_TRANS) + goto again; + else if (flush == BTRFS_RESERVE_FLUSH_ALL && + flush_state <= COMMIT_TRANS) goto again; out: @@ -4131,9 +4155,9 @@ void btrfs_free_block_rsv(struct btrfs_root *root, kfree(rsv); } -static inline int __block_rsv_add(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes, int flush) +int btrfs_block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, u64 num_bytes, + enum btrfs_reserve_flush_enum flush) { int ret; @@ -4149,20 +4173,6 @@ static inline int __block_rsv_add(struct btrfs_root *root, return ret; } -int btrfs_block_rsv_add(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) -{ - return __block_rsv_add(root, block_rsv, num_bytes, 1); -} - -int btrfs_block_rsv_add_noflush(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) -{ - return __block_rsv_add(root, block_rsv, num_bytes, 0); -} - int btrfs_block_rsv_check(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, int min_factor) { @@ -4181,9 +4191,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root, return ret; } -static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved, int flush) +int btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, u64 min_reserved, + enum btrfs_reserve_flush_enum flush) { u64 num_bytes = 0; int ret = -ENOSPC; @@ -4211,20 +4221,6 @@ static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, return ret; } -int btrfs_block_rsv_refill(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved) -{ - return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1); -} - -int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved) -{ - return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0); -} - int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes) @@ -4515,14 +4511,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) u64 csum_bytes; unsigned nr_extents = 0; int extra_reserve = 0; - int flush = 1; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; int ret; /* Need to be holding the i_mutex here if we aren't free space cache */ if (btrfs_is_free_space_inode(inode)) - flush = 0; + flush = BTRFS_RESERVE_NO_FLUSH; - if (flush && btrfs_transaction_in_commit(root->fs_info)) + if (flush != BTRFS_RESERVE_NO_FLUSH && + btrfs_transaction_in_commit(root->fs_info)) schedule_timeout(1); mutex_lock(&BTRFS_I(inode)->delalloc_mutex); @@ -6252,7 +6249,8 @@ use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); if (block_rsv->size == 0) { - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, + BTRFS_RESERVE_NO_FLUSH); /* * If we couldn't reserve metadata bytes try and use some from * the global reserve. 
@@ -6279,7 +6277,8 @@ use_block_rsv(struct btrfs_trans_handle *trans, printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret); WARN_ON(1); } - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, + BTRFS_RESERVE_NO_FLUSH); if (!ret) { return block_rsv; } else if (ret && block_rsv != global_rsv) { diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index b1a1c929ba80..d26f67a59e36 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -434,8 +434,9 @@ int btrfs_save_ino_cache(struct btrfs_root *root, * 3 items for pre-allocation */ trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8); - ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv, - trans->bytes_reserved); + ret = btrfs_block_rsv_add(root, trans->block_rsv, + trans->bytes_reserved, + BTRFS_RESERVE_NO_FLUSH); if (ret) goto out; trace_btrfs_space_reservation(root->fs_info, "ino_cache", diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 95542a1b3dfc..db3dd4ed057f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3829,7 +3829,8 @@ void btrfs_evict_inode(struct inode *inode) * inode item when doing the truncate. */ while (1) { - ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); + ret = btrfs_block_rsv_refill(root, rsv, min_size, + BTRFS_RESERVE_FLUSH_LIMIT); /* * Try and steal from the global reserve since we will @@ -3847,7 +3848,7 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } - trans = btrfs_start_transaction_noflush(root, 1); + trans = btrfs_start_transaction_lflush(root, 1); if (IS_ERR(trans)) { btrfs_orphan_del(NULL, inode); btrfs_free_block_rsv(root, rsv); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 776f0aa128fc..242d6de4d8eb 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2074,7 +2074,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, BUG_ON(IS_ERR(trans)); trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved); + ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved, + BTRFS_RESERVE_FLUSH_ALL); if (ret) { BUG_ON(ret != -EAGAIN); ret = btrfs_commit_transaction(trans, root); @@ -2184,7 +2185,8 @@ int prepare_to_merge(struct reloc_control *rc, int err) again: if (!err) { num_bytes = rc->merging_rsv_size; - ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); + ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes, + BTRFS_RESERVE_FLUSH_ALL); if (ret) err = ret; } @@ -2459,7 +2461,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, num_bytes = calcu_metadata_size(rc, node, 1) * 2; trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); + ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes, + BTRFS_RESERVE_FLUSH_ALL); if (ret) { if (ret == -EAGAIN) rc->commit_transaction = 1; @@ -3685,7 +3688,8 @@ int prepare_to_relocate(struct reloc_control *rc) * is no reservation in transaction handle. 
*/ ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv, - rc->extent_root->nodesize * 256); + rc->extent_root->nodesize * 256, + BTRFS_RESERVE_FLUSH_ALL); if (ret) return ret; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 04bbfb1052eb..4e1def4c06b1 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -295,9 +295,9 @@ static int may_wait_transaction(struct btrfs_root *root, int type) return 0; } -static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, - u64 num_items, int type, - int noflush) +static struct btrfs_trans_handle * +start_transaction(struct btrfs_root *root, u64 num_items, int type, + enum btrfs_reserve_flush_enum flush) { struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; @@ -331,14 +331,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, } num_bytes = btrfs_calc_trans_metadata_size(root, num_items); - if (noflush) - ret = btrfs_block_rsv_add_noflush(root, - &root->fs_info->trans_block_rsv, - num_bytes); - else - ret = btrfs_block_rsv_add(root, - &root->fs_info->trans_block_rsv, - num_bytes); + ret = btrfs_block_rsv_add(root, + &root->fs_info->trans_block_rsv, + num_bytes, flush); if (ret) return ERR_PTR(ret); } @@ -422,13 +417,15 @@ got_it: struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, int num_items) { - return start_transaction(root, num_items, TRANS_START, 0); + return start_transaction(root, num_items, TRANS_START, + BTRFS_RESERVE_FLUSH_ALL); } -struct btrfs_trans_handle *btrfs_start_transaction_noflush( +struct btrfs_trans_handle *btrfs_start_transaction_lflush( struct btrfs_root *root, int num_items) { - return start_transaction(root, num_items, TRANS_START, 1); + return start_transaction(root, num_items, TRANS_START, + BTRFS_RESERVE_FLUSH_LIMIT); } struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) @@ -1032,8 +1029,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); if (to_reserve > 0) { - ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv, - to_reserve); + ret = btrfs_block_rsv_add(root, &pending->block_rsv, + to_reserve, + BTRFS_RESERVE_NO_FLUSH); if (ret) { pending->error = ret; goto no_free_objectid; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 80961947a6b2..0e8aa1e6c287 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -105,7 +105,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, int num_items); -struct btrfs_trans_handle *btrfs_start_transaction_noflush( +struct btrfs_trans_handle *btrfs_start_transaction_lflush( struct btrfs_root *root, int num_items); struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); -- cgit v1.2.1 From de1ee92ac3bce4c9d760016c4d6198158e6e2f15 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 19 Oct 2012 16:50:56 -0400 Subject: Btrfs: recheck bio against block device when we map the bio Alex reported a problem where we were writing between chunks on a rbd device. The thing is we do bio_add_page using logical offsets, but the physical offset may be different. 
So when we map the bio, now check to see if the bio is still OK with the physical offset, and if it is not, split the bio up and redo the bio_add_page() with the physical sector. This fixes the problem for Alex and doesn't affect performance in the normal case. Thanks, Reported-and-tested-by: Alex Elder Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 159 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 131 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a8adf2686473..eaaf0bf52791 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4217,6 +4217,113 @@ static noinline void schedule_bio(struct btrfs_root *root, &device->work); } +static int bio_size_ok(struct block_device *bdev, struct bio *bio, + sector_t sector) +{ + struct bio_vec *prev; + struct request_queue *q = bdev_get_queue(bdev); + unsigned short max_sectors = queue_max_sectors(q); + struct bvec_merge_data bvm = { + .bi_bdev = bdev, + .bi_sector = sector, + .bi_rw = bio->bi_rw, + }; + + if (bio->bi_vcnt == 0) { + WARN_ON(1); + return 1; + } + + prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; + if ((bio->bi_size >> 9) > max_sectors) + return 0; + + if (!q->merge_bvec_fn) + return 1; + + bvm.bi_size = bio->bi_size - prev->bv_len; + if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) + return 0; + return 1; +} + +static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, + struct bio *bio, u64 physical, int dev_nr, + int rw, int async) +{ + struct btrfs_device *dev = bbio->stripes[dev_nr].dev; + + bio->bi_private = bbio; + bio->bi_private = merge_stripe_index_into_bio_private( + bio->bi_private, (unsigned int)dev_nr); + bio->bi_end_io = btrfs_end_bio; + bio->bi_sector = physical >> 9; +#ifdef DEBUG + { + struct rcu_string *name; + + rcu_read_lock(); + name = rcu_dereference(dev->name); + pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu " + "(%s id %llu), size=%u\n", rw, + (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, + name->str, dev->devid, bio->bi_size); + rcu_read_unlock(); + } +#endif + bio->bi_bdev = dev->bdev; + if (async) + schedule_bio(root, dev, rw, bio); + else + btrfsic_submit_bio(rw, bio); +} + +static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, + struct bio *first_bio, struct btrfs_device *dev, + int dev_nr, int rw, int async) +{ + struct bio_vec *bvec = first_bio->bi_io_vec; + struct bio *bio; + int nr_vecs = bio_get_nr_vecs(dev->bdev); + u64 physical = bbio->stripes[dev_nr].physical; + +again: + bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS); + if (!bio) + return -ENOMEM; + + while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) { + if (bio_add_page(bio, bvec->bv_page, bvec->bv_len, + bvec->bv_offset) < bvec->bv_len) { + u64 len = bio->bi_size; + + atomic_inc(&bbio->stripes_pending); + submit_stripe_bio(root, bbio, bio, physical, dev_nr, + rw, async); + physical += len; + goto again; + } + bvec++; + } + + submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async); + return 0; +} + +static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) +{ + atomic_inc(&bbio->error); + if (atomic_dec_and_test(&bbio->stripes_pending)) { + bio->bi_private = bbio->private; + bio->bi_end_io = bbio->end_io; + bio->bi_bdev = (struct block_device *) + (unsigned long)bbio->mirror_num; + bio->bi_sector = logical >> 9; + kfree(bbio); + bio_endio(bio, -EIO); + } +} + int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio
*bio, int mirror_num, int async_submit) { @@ -4255,40 +4362,36 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, atomic_set(&bbio->stripes_pending, bbio->num_stripes); while (dev_nr < total_devs) { + dev = bbio->stripes[dev_nr].dev; + if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { + bbio_error(bbio, first_bio, logical); + dev_nr++; + continue; + } + + /* + * Check and see if we're ok with this bio based on it's size + * and offset with the given device. + */ + if (!bio_size_ok(dev->bdev, first_bio, + bbio->stripes[dev_nr].physical >> 9)) { + ret = breakup_stripe_bio(root, bbio, first_bio, dev, + dev_nr, rw, async_submit); + BUG_ON(ret); + dev_nr++; + continue; + } + if (dev_nr < total_devs - 1) { bio = bio_clone(first_bio, GFP_NOFS); BUG_ON(!bio); /* -ENOMEM */ } else { bio = first_bio; } - bio->bi_private = bbio; - bio->bi_private = merge_stripe_index_into_bio_private( - bio->bi_private, (unsigned int)dev_nr); - bio->bi_end_io = btrfs_end_bio; - bio->bi_sector = bbio->stripes[dev_nr].physical >> 9; - dev = bbio->stripes[dev_nr].dev; - if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { -#ifdef DEBUG - struct rcu_string *name; - - rcu_read_lock(); - name = rcu_dereference(dev->name); - pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu " - "(%s id %llu), size=%u\n", rw, - (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, - name->str, dev->devid, bio->bi_size); - rcu_read_unlock(); - } -#endif - bio->bi_bdev = dev->bdev; - if (async_submit) - schedule_bio(root, dev, rw, bio); - else - btrfsic_submit_bio(rw, bio); - } else { - bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; - bio->bi_sector = logical >> 9; - bio_endio(bio, -EIO); - } + + submit_stripe_bio(root, bbio, bio, + bbio->stripes[dev_nr].physical, dev_nr, rw, + async_submit); dev_nr++; } return 0; -- cgit v1.2.1 From de6c4115a297d4bbf178aca9948c3539f89c9caa Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 18 Oct 2012 08:18:01 +0000 Subject: Btrfs: fix unnecessary while loop when searching the free space cache When we find a bitmap free space entry, we may check whether the previous extent entry covers the offset or not. But if we find that this entry is also a bitmap entry, we would continue to check the previous entry of the current one in a while loop. It is unnecessary, because it is impossible that the extent entry which is in front of a bitmap entry can cover the offset of the entry after that bitmap entry.
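The invariant behind the simplification, restated next to a condensed form of the new lookup (comments added for illustration): free-space entries are indexed by offset and do not overlap, so, as the message above argues, only the immediate predecessor of a bitmap entry can possibly cover the searched offset; every earlier entry ends even sooner.

	/* Condensed from the hunk below: one rb_prev() step suffices. */
	n = rb_prev(&entry->offset_index);
	if (n) {
		prev = rb_entry(n, struct btrfs_free_space, offset_index);
		if (!prev->bitmap &&
		    prev->offset + prev->bytes > offset)
			entry = prev;	/* predecessor extent covers 'offset' */
	}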
Signed-off-by: Miao Xie Reviewed-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 1027b854b90c..557502ca1a2a 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1250,18 +1250,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, * if previous extent entry covers the offset, * we should return it instead of the bitmap entry */ - n = &entry->offset_index; - while (1) { - n = rb_prev(n); - if (!n) - break; + n = rb_prev(&entry->offset_index); + if (n) { prev = rb_entry(n, struct btrfs_free_space, offset_index); - if (!prev->bitmap) { - if (prev->offset + prev->bytes > offset) - entry = prev; - break; - } + if (!prev->bitmap && + prev->offset + prev->bytes > offset) + entry = prev; } } return entry; @@ -1287,18 +1282,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, } if (entry->bitmap) { - n = &entry->offset_index; - while (1) { - n = rb_prev(n); - if (!n) - break; + n = rb_prev(&entry->offset_index); + if (n) { prev = rb_entry(n, struct btrfs_free_space, offset_index); - if (!prev->bitmap) { - if (prev->offset + prev->bytes > offset) - return prev; - break; - } + if (!prev->bitmap && + prev->offset + prev->bytes > offset) + return prev; } if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) return entry; -- cgit v1.2.1 From 95c80bb1f6b24b57058d971ed252b2c1c5121b51 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 19 Oct 2012 09:50:52 +0000 Subject: Btrfs: MOD_LOG_KEY_REMOVE_WHILE_MOVING never changes node's nritems Key MOD_LOG_KEY_REMOVE_WHILE_MOVING means that we're doing a memmove inside an extent buffer node, and the node's number of items remains unchanged (unless we are inserting a single pointer, but we have MOD_LOG_KEY_ADD for that). So we don't need to increase the node's number of items during rewinding, otherwise we may get a node larger than leafsize and cause general protection errors later. Here are the details: - If we do a memory move for inserting a single pointer, we need to increase the node's nritems by one, and we honor MOD_LOG_KEY_ADD for adding. - If we do a memory move for deleting a single pointer, we need to decrease the node's nritems by one, and we honor MOD_LOG_KEY_REMOVE for deleting. - If we do a memory move for balancing left/right, we need to decrease the node's nritems, and we honor MOD_LOG_KEY_REMOVE for balancing.
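To make the bookkeeping concrete, a hedged summary of how the rewind code adjusts the item count n per logged operation (a restatement of the list above; rewinding applies the inverse effect because it undoes the operation):

	/*
	 * Worked summary (illustrative):
	 *
	 *   logged op                        forward effect   rewind effect
	 *   MOD_LOG_KEY_ADD                  nritems + 1      n--
	 *   MOD_LOG_KEY_REMOVE               nritems - 1      n++
	 *   MOD_LOG_KEY_REMOVE_WHILE_MOVING  unchanged        unchanged  <- the fix
	 */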
Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index cdfb4c49a806..b12c03959162 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1140,13 +1140,13 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, switch (tm->op) { case MOD_LOG_KEY_REMOVE_WHILE_FREEING: BUG_ON(tm->slot < n); - case MOD_LOG_KEY_REMOVE_WHILE_MOVING: case MOD_LOG_KEY_REMOVE: + n++; + case MOD_LOG_KEY_REMOVE_WHILE_MOVING: btrfs_set_node_key(eb, &tm->key, tm->slot); btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); btrfs_set_node_ptr_generation(eb, tm->slot, tm->generation); - n++; break; case MOD_LOG_KEY_REPLACE: BUG_ON(tm->slot >= n); -- cgit v1.2.1 From 6a7a665d78c5dd8bc76a010648c4e7d84517ab5a Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 19 Oct 2012 09:50:53 +0000 Subject: Btrfs: reorder tree mod log operations in deleting a pointer Since we don't use MOD_LOG_KEY_REMOVE_WHILE_MOVING to add nritems during rewinding, we should insert a MOD_LOG_KEY_REMOVE operation first. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index b12c03959162..4d518bd7751d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4609,6 +4609,12 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 nritems; int ret; + if (tree_mod_log && level) { + ret = tree_mod_log_insert_key(root->fs_info, parent, slot, + MOD_LOG_KEY_REMOVE); + BUG_ON(ret < 0); + } + nritems = btrfs_header_nritems(parent); if (slot != nritems - 1) { if (tree_mod_log && level) @@ -4619,10 +4625,6 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_node_key_ptr_offset(slot + 1), sizeof(struct btrfs_key_ptr) * (nritems - slot - 1)); - } else if (tree_mod_log && level) { - ret = tree_mod_log_insert_key(root->fs_info, parent, slot, - MOD_LOG_KEY_REMOVE); - BUG_ON(ret < 0); - } } nritems--; -- cgit v1.2.1 From 0e411ecec60138f22442728f036d38cfea007817 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 19 Oct 2012 09:50:54 +0000 Subject: Btrfs: kill unnecessary arguments in del_ptr The argument 'tree_mod_log' is not necessary since all of its callers enable it.
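The resulting order of operations in del_ptr(), condensed for illustration (argument lists abbreviated with "..."): the REMOVE is logged before the move is logged and performed, so a rewind that re-adds the pointer replays a value captured before the memmove clobbered the slot.

	/* Condensed from the del_ptr() hunk above; "..." elides arguments. */
	tree_mod_log_insert_key(..., slot, MOD_LOG_KEY_REMOVE);	/* 1: record key */
	tree_mod_log_eb_move(..., slot, slot + 1, nritems - slot - 1);	/* 2: log move */
	memmove_extent_buffer(...);					/* 3: do move */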
Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4d518bd7751d..615b74968fab 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -38,8 +38,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level, int slot, - int tree_mod_log); + struct btrfs_path *path, int level, int slot); static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb); struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr, @@ -1827,7 +1826,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(right) == 0) { clean_tree_block(trans, root, right); btrfs_tree_unlock(right); - del_ptr(trans, root, path, level + 1, pslot + 1, 1); + del_ptr(trans, root, path, level + 1, pslot + 1); root_sub_used(root, right->len); btrfs_free_tree_block(trans, root, right, 0, 1); free_extent_buffer_stale(right); @@ -1871,7 +1870,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(mid) == 0) { clean_tree_block(trans, root, mid); btrfs_tree_unlock(mid); - del_ptr(trans, root, path, level + 1, pslot, 1); + del_ptr(trans, root, path, level + 1, pslot); root_sub_used(root, mid->len); btrfs_free_tree_block(trans, root, mid, 0, 1); free_extent_buffer_stale(mid); @@ -4602,14 +4601,13 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root * empty a node. */ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level, int slot, - int tree_mod_log) + struct btrfs_path *path, int level, int slot) { struct extent_buffer *parent = path->nodes[level]; u32 nritems; int ret; - if (tree_mod_log && level) { + if (level) { ret = tree_mod_log_insert_key(root->fs_info, parent, slot, MOD_LOG_KEY_REMOVE); BUG_ON(ret < 0); @@ -4617,7 +4615,7 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, nritems = btrfs_header_nritems(parent); if (slot != nritems - 1) { - if (tree_mod_log && level) + if (level) tree_mod_log_eb_move(root->fs_info, parent, slot, slot + 1, nritems - slot - 1); memmove_extent_buffer(parent, @@ -4658,7 +4656,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, struct extent_buffer *leaf) { WARN_ON(btrfs_header_generation(leaf) != trans->transid); - del_ptr(trans, root, path, 1, path->slots[1], 1); + del_ptr(trans, root, path, 1, path->slots[1]); /* * btrfs_free_extent is expensive, we want to make sure we -- cgit v1.2.1 From 32adf0901371c8b9d258dba7811f3067d1d2ea5c Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 19 Oct 2012 12:52:15 +0000 Subject: Btrfs: cleanup unused arguments 'disk_key' is not used at all. 
Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 615b74968fab..100c274a1cfe 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -775,8 +775,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, static noinline void tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int slot, int atomic) + struct extent_buffer *eb, int slot, int atomic) { int ret; @@ -1835,7 +1834,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, struct btrfs_disk_key right_key; btrfs_node_key(right, &right_key, 0); tree_mod_log_set_node_key(root->fs_info, parent, - &right_key, pslot + 1, 0); + pslot + 1, 0); btrfs_set_node_key(parent, &right_key, pslot + 1); btrfs_mark_buffer_dirty(parent); } @@ -1879,7 +1878,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* update the parent key to reflect our changes */ struct btrfs_disk_key mid_key; btrfs_node_key(mid, &mid_key, 0); - tree_mod_log_set_node_key(root->fs_info, parent, &mid_key, + tree_mod_log_set_node_key(root->fs_info, parent, pslot, 0); btrfs_set_node_key(parent, &mid_key, pslot); btrfs_mark_buffer_dirty(parent); @@ -1979,7 +1978,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, orig_slot += left_nr; btrfs_node_key(mid, &disk_key, 0); tree_mod_log_set_node_key(root->fs_info, parent, - &disk_key, pslot, 0); + pslot, 0); btrfs_set_node_key(parent, &disk_key, pslot); btrfs_mark_buffer_dirty(parent); if (btrfs_header_nritems(left) > orig_slot) { @@ -2032,7 +2031,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_node_key(right, &disk_key, 0); tree_mod_log_set_node_key(root->fs_info, parent, - &disk_key, pslot + 1, 0); + pslot + 1, 0); btrfs_set_node_key(parent, &disk_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -2916,7 +2915,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans, if (!path->nodes[i]) break; t = path->nodes[i]; - tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1); + tree_mod_log_set_node_key(root->fs_info, t, tslot, 1); btrfs_set_node_key(t, key, tslot); btrfs_mark_buffer_dirty(path->nodes[i]); if (tslot != 0) -- cgit v1.2.1 From 7b398f8e58c415738e397645c926253c428cf002 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 22 Oct 2012 15:52:28 -0400 Subject: Btrfs: fill the global reserve when unpinning space Dave gave me an image of a very full file system that would abort the transaction because it ran out of space while committing the transaction. This is because we would think there was plenty of room to create a snapshot even though the global reserve was not full. This happens because we calculate the global reserve size before we unpin any space, so after we unpin the space we allow reservations to occur even though we haven't reserved all of the space for our global reserve. Fix this by adding to the global reserve while unpinning in order to make sure we always have enough space to do our work. With this patch we no longer end up with an aborted transaction, we return ENOSPC properly to the person trying to create the snapshot. 
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2136adda2a0f..b495cb4b9b2b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4949,9 +4949,13 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_group_cache *cache = NULL; + struct btrfs_space_info *space_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; u64 len; + bool readonly; while (start <= end) { + readonly = false; if (!cache || start >= cache->key.objectid + cache->key.offset) { if (cache) @@ -4969,15 +4973,30 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) } start += len; + space_info = cache->space_info; - spin_lock(&cache->space_info->lock); + spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; - cache->space_info->bytes_pinned -= len; - if (cache->ro) - cache->space_info->bytes_readonly += len; + space_info->bytes_pinned -= len; + if (cache->ro) { + space_info->bytes_readonly += len; + readonly = true; + } spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); + if (!readonly && global_rsv->space_info == space_info) { + spin_lock(&global_rsv->lock); + if (!global_rsv->full) { + len = min(len, global_rsv->size - + global_rsv->reserved); + global_rsv->reserved += len; + space_info->bytes_may_use += len; + if (global_rsv->reserved >= global_rsv->size) + global_rsv->full = 1; + } + spin_unlock(&global_rsv->lock); + } + spin_unlock(&space_info->lock); } if (cache) -- cgit v1.2.1 From 8ccf6f19b67f7e0921063cc309f4672a6afcb528 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 25 Oct 2012 09:28:04 +0000 Subject: Btrfs: make delalloc inodes be flushed by multiple tasks This patch introduces a new worker pool named "flush_workers"; if we want to force all the inodes with pending delalloc to disk, we can queue those inodes into the work queue of the worker pool, and in this way those inodes will be flushed by multiple tasks.
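The queue-then-reap shape of the reworked btrfs_start_delalloc_inodes(), as a hedged sketch (the helper names are the ones added by this patch; locking and error unwinding are trimmed for illustration):

	/* Illustrative only: one work item per delalloc inode, then reap. */
	struct btrfs_delalloc_work *work, *next;
	LIST_HEAD(works);

	list_for_each_entry(binode, &root->fs_info->delalloc_inodes,
			    delalloc_inodes) {
		work = btrfs_alloc_delalloc_work(&binode->vfs_inode, 0, delay_iput);
		if (!work)
			break;		/* the real code unwinds and returns -ENOMEM */
		list_add_tail(&work->list, &works);
		btrfs_queue_worker(&root->fs_info->flush_workers, &work->work);
	}

	list_for_each_entry_safe(work, next, &works, list) {
		list_del_init(&work->list);
		btrfs_wait_and_free_delalloc_work(work);	/* blocks on completion */
	}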
Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 14 +++++++++ fs/btrfs/disk-io.c | 7 +++++ fs/btrfs/inode.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++---- fs/btrfs/relocation.c | 6 +++- fs/btrfs/transaction.c | 6 +++- 5 files changed, 103 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8fd9fe4282f5..cad16566da37 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1333,6 +1333,7 @@ struct btrfs_fs_info { struct btrfs_workers generic_worker; struct btrfs_workers workers; struct btrfs_workers delalloc_workers; + struct btrfs_workers flush_workers; struct btrfs_workers endio_workers; struct btrfs_workers endio_meta_workers; struct btrfs_workers endio_meta_write_workers; @@ -3277,6 +3278,19 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans, int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); /* inode.c */ +struct btrfs_delalloc_work { + struct inode *inode; + int wait; + int delay_iput; + struct completion completion; + struct list_head list; + struct btrfs_work work; +}; + +struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, + int wait, int delay_iput); +void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work); + struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len, int create); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7cda51995c1e..bd70c2852ba0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2279,6 +2279,10 @@ int open_ctree(struct super_block *sb, fs_info->thread_pool_size, &fs_info->generic_worker); + btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc", + fs_info->thread_pool_size, + &fs_info->generic_worker); + btrfs_init_workers(&fs_info->submit_workers, "submit", min_t(u64, fs_devices->num_devices, fs_info->thread_pool_size), @@ -2350,6 +2354,7 @@ int open_ctree(struct super_block *sb, ret |= btrfs_start_workers(&fs_info->delayed_workers); ret |= btrfs_start_workers(&fs_info->caching_workers); ret |= btrfs_start_workers(&fs_info->readahead_workers); + ret |= btrfs_start_workers(&fs_info->flush_workers); if (ret) { err = -ENOMEM; goto fail_sb_buffer; @@ -2667,6 +2672,7 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->submit_workers); btrfs_stop_workers(&fs_info->delayed_workers); btrfs_stop_workers(&fs_info->caching_workers); + btrfs_stop_workers(&fs_info->flush_workers); fail_alloc: fail_iput: btrfs_mapping_tree_free(&fs_info->mapping_tree); @@ -3339,6 +3345,7 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->delayed_workers); btrfs_stop_workers(&fs_info->caching_workers); btrfs_stop_workers(&fs_info->readahead_workers); + btrfs_stop_workers(&fs_info->flush_workers); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_test_opt(root, CHECK_INTEGRITY)) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index db3dd4ed057f..dce9e218b845 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -71,6 +71,7 @@ static const struct file_operations btrfs_dir_file_operations; static struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; +static struct kmem_cache *btrfs_delalloc_work_cachep; struct kmem_cache *btrfs_trans_handle_cachep; struct kmem_cache *btrfs_transaction_cachep; struct kmem_cache *btrfs_path_cachep; @@ -7204,6 +7205,8 @@ void btrfs_destroy_cachep(void) kmem_cache_destroy(btrfs_path_cachep); if (btrfs_free_space_cachep) 
kmem_cache_destroy(btrfs_free_space_cachep); + if (btrfs_delalloc_work_cachep) + kmem_cache_destroy(btrfs_delalloc_work_cachep); } int btrfs_init_cachep(void) @@ -7238,6 +7241,13 @@ int btrfs_init_cachep(void) if (!btrfs_free_space_cachep) goto fail; + btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work", + sizeof(struct btrfs_delalloc_work), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); + if (!btrfs_delalloc_work_cachep) + goto fail; + return 0; fail: btrfs_destroy_cachep(); @@ -7448,6 +7458,49 @@ out_notrans: return ret; } +static void btrfs_run_delalloc_work(struct btrfs_work *work) +{ + struct btrfs_delalloc_work *delalloc_work; + + delalloc_work = container_of(work, struct btrfs_delalloc_work, + work); + if (delalloc_work->wait) + btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1); + else + filemap_flush(delalloc_work->inode->i_mapping); + + if (delalloc_work->delay_iput) + btrfs_add_delayed_iput(delalloc_work->inode); + else + iput(delalloc_work->inode); + complete(&delalloc_work->completion); +} + +struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, + int wait, int delay_iput) +{ + struct btrfs_delalloc_work *work; + + work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS); + if (!work) + return NULL; + + init_completion(&work->completion); + INIT_LIST_HEAD(&work->list); + work->inode = inode; + work->wait = wait; + work->delay_iput = delay_iput; + work->work.func = btrfs_run_delalloc_work; + + return work; +} + +void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) +{ + wait_for_completion(&work->completion); + kmem_cache_free(btrfs_delalloc_work_cachep, work); +} + /* * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. 
@@ -7457,10 +7510,15 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) struct list_head *head = &root->fs_info->delalloc_inodes; struct btrfs_inode *binode; struct inode *inode; + struct btrfs_delalloc_work *work, *next; + struct list_head works; + int ret = 0; if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; + INIT_LIST_HEAD(&works); + spin_lock(&root->fs_info->delalloc_lock); while (!list_empty(head)) { binode = list_entry(head->next, struct btrfs_inode, @@ -7470,11 +7528,14 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) list_del_init(&binode->delalloc_inodes); spin_unlock(&root->fs_info->delalloc_lock); if (inode) { - filemap_flush(inode->i_mapping); - if (delay_iput) - btrfs_add_delayed_iput(inode); - else - iput(inode); + work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); + if (!work) { + ret = -ENOMEM; + goto out; + } + list_add_tail(&work->list, &works); + btrfs_queue_worker(&root->fs_info->flush_workers, + &work->work); } cond_resched(); spin_lock(&root->fs_info->delalloc_lock); @@ -7493,7 +7554,12 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) atomic_read(&root->fs_info->async_delalloc_pages) == 0)); } atomic_dec(&root->fs_info->async_submit_draining); - return 0; +out: + list_for_each_entry_safe(work, next, &works, list) { + list_del_init(&work->list); + btrfs_wait_and_free_delalloc_work(work); + } + return ret; } static int btrfs_symlink(struct inode *dir, struct dentry *dentry, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 242d6de4d8eb..270f24ffe1be 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4061,7 +4061,11 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) (unsigned long long)rc->block_group->key.objectid, (unsigned long long)rc->block_group->flags); - btrfs_start_delalloc_inodes(fs_info->tree_root, 0); + ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0); + if (ret < 0) { + err = ret; + goto out; + } btrfs_wait_ordered_extents(fs_info->tree_root, 0); while (1) { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 4e1def4c06b1..9c466f9f8175 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1497,7 +1497,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, WARN_ON(cur_trans != trans->transaction); if (flush_on_commit || snap_pending) { - btrfs_start_delalloc_inodes(root, 1); + ret = btrfs_start_delalloc_inodes(root, 1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto cleanup_transaction; + } btrfs_wait_ordered_extents(root, 1); } -- cgit v1.2.1 From 25287e0a16c0ad068aa89ab01aea6c699b31ec12 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 25 Oct 2012 09:31:03 +0000 Subject: Btrfs: make ordered operations be handled by multiple tasks The processing of the ordered operations is similar to the delalloc inode flush, so we handle them with the flush workers as well.
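One detail worth pulling out of the hunk below as a sketch: on allocation failure the function cannot simply return, because it has already detached inodes from the global list; it must splice the unprocessed tail back so other tasks can still see them.

	/* Condensed from btrfs_run_ordered_operations() below. */
	work = btrfs_alloc_delalloc_work(inode, wait, 1);
	if (!work) {
		spin_lock(&root->fs_info->ordered_extent_lock);
		/* hand the unprocessed inodes back to the global list */
		list_splice_tail(&splice, &root->fs_info->ordered_operations);
		spin_unlock(&root->fs_info->ordered_extent_lock);
		ret = -ENOMEM;
		goto out;
	}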
Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/ordered-data.c | 46 ++++++++++++++++++++++++++++++---------------- fs/btrfs/ordered-data.h | 2 +- fs/btrfs/transaction.c | 18 ++++++++++++++---- 3 files changed, 45 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 7772f02ba28e..ab2a3c0c540f 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -519,13 +519,17 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) * extra check to make sure the ordered operation list really is empty * before we return */ -void btrfs_run_ordered_operations(struct btrfs_root *root, int wait) +int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) { struct btrfs_inode *btrfs_inode; struct inode *inode; struct list_head splice; + struct list_head works; + struct btrfs_delalloc_work *work, *next; + int ret = 0; INIT_LIST_HEAD(&splice); + INIT_LIST_HEAD(&works); mutex_lock(&root->fs_info->ordered_operations_mutex); spin_lock(&root->fs_info->ordered_extent_lock); @@ -533,6 +537,7 @@ again: list_splice_init(&root->fs_info->ordered_operations, &splice); while (!list_empty(&splice)) { + btrfs_inode = list_entry(splice.next, struct btrfs_inode, ordered_operations); @@ -549,15 +554,26 @@ again: list_add_tail(&BTRFS_I(inode)->ordered_operations, &root->fs_info->ordered_operations); } + + if (!inode) + continue; spin_unlock(&root->fs_info->ordered_extent_lock); - if (inode) { - if (wait) - btrfs_wait_ordered_range(inode, 0, (u64)-1); - else - filemap_flush(inode->i_mapping); - btrfs_add_delayed_iput(inode); + work = btrfs_alloc_delalloc_work(inode, wait, 1); + if (!work) { + if (list_empty(&BTRFS_I(inode)->ordered_operations)) + list_add_tail(&btrfs_inode->ordered_operations, + &splice); + spin_lock(&root->fs_info->ordered_extent_lock); + list_splice_tail(&splice, + &root->fs_info->ordered_operations); + spin_unlock(&root->fs_info->ordered_extent_lock); + ret = -ENOMEM; + goto out; } + list_add_tail(&work->list, &works); + btrfs_queue_worker(&root->fs_info->flush_workers, + &work->work); cond_resched(); spin_lock(&root->fs_info->ordered_extent_lock); @@ -566,7 +582,13 @@ again: goto again; spin_unlock(&root->fs_info->ordered_extent_lock); +out: + list_for_each_entry_safe(work, next, &works, list) { + list_del_init(&work->list); + btrfs_wait_and_free_delalloc_work(work); + } mutex_unlock(&root->fs_info->ordered_operations_mutex); + return ret; } /* @@ -934,15 +956,6 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, if (last_mod < root->fs_info->last_trans_committed) return; - /* - * the transaction is already committing. 
Just start the IO and - * don't bother with all of this list nonsense */ - if (trans && root->fs_info->running_transaction->blocked) { - btrfs_wait_ordered_range(inode, 0, (u64)-1); - return; - } - spin_lock(&root->fs_info->ordered_extent_lock); if (list_empty(&BTRFS_I(inode)->ordered_operations)) { list_add_tail(&BTRFS_I(inode)->ordered_operations, @@ -959,6 +972,7 @@ int __init ordered_data_init(void) NULL); if (!btrfs_ordered_extent_cache) return -ENOMEM; + return 0; } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index dd27a0b46a37..e8dcec635112 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -186,7 +186,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); -void btrfs_run_ordered_operations(struct btrfs_root *root, int wait); +int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 9c466f9f8175..259f74eabdb8 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1412,15 +1412,21 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_transaction *prev_trans = NULL; DEFINE_WAIT(wait); - int ret = -EIO; + int ret; int should_grow = 0; unsigned long now = get_seconds(); int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); - btrfs_run_ordered_operations(root, 0); + ret = btrfs_run_ordered_operations(root, 0); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto cleanup_transaction; + } - if (cur_trans->aborted) + if (cur_trans->aborted) { + ret = cur_trans->aborted; goto cleanup_transaction; + } /* make a pass through all the delayed refs we have so far * any runnings procs may add more while we are here @@ -1523,7 +1529,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, * it here and no for sure that nothing new will be added * to the list */ - btrfs_run_ordered_operations(root, 1); + ret = btrfs_run_ordered_operations(root, 1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto cleanup_transaction; + } prepare_to_wait(&cur_trans->writer_wait, &wait, TASK_UNINTERRUPTIBLE); -- cgit v1.2.1 From 9afab8820bb8b55af669b199597d6716e04d1ba8 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 25 Oct 2012 09:41:36 +0000 Subject: Btrfs: make ordered extents be flushed by multiple tasks Though the processing of the ordered extents is a bit different from the delalloc inode flush, we can see it as a subset of that flush, so we also handle them with the flush workers.
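The embedded-work idiom the patch relies on, sketched with comments (a condensed restatement of the new callback below): the btrfs_work lives inside the ordered extent itself, so the worker thread recovers its container with container_of() and then signals the embedded completion.

	/* Condensed from the patch: recover the container of the work item. */
	static void run_ordered_extent_work(struct btrfs_work *work)
	{
		struct btrfs_ordered_extent *ordered;

		ordered = container_of(work, struct btrfs_ordered_extent,
				       flush_work);
		btrfs_start_ordered_extent(ordered->inode, ordered, 1);
		complete(&ordered->completion);	/* wakes wait_for_completion() */
	}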
Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/ordered-data.c | 41 +++++++++++++++++++++++++++++++++-------- fs/btrfs/ordered-data.h | 5 ++++- 2 files changed, 37 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index ab2a3c0c540f..eecc20f14cfa 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -211,6 +211,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, init_waitqueue_head(&entry->wait); INIT_LIST_HEAD(&entry->list); INIT_LIST_HEAD(&entry->root_extent_list); + INIT_LIST_HEAD(&entry->work_list); + init_completion(&entry->completion); trace_btrfs_ordered_extent_add(inode, entry); @@ -464,18 +466,28 @@ void btrfs_remove_ordered_extent(struct inode *inode, wake_up(&entry->wait); } +static void btrfs_run_ordered_extent_work(struct btrfs_work *work) +{ + struct btrfs_ordered_extent *ordered; + + ordered = container_of(work, struct btrfs_ordered_extent, flush_work); + btrfs_start_ordered_extent(ordered->inode, ordered, 1); + complete(&ordered->completion); +} + /* * wait for all the ordered extents in a root. This is done when balancing * space between drives. */ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) { - struct list_head splice; + struct list_head splice, works; struct list_head *cur; - struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_extent *ordered, *next; struct inode *inode; INIT_LIST_HEAD(&splice); + INIT_LIST_HEAD(&works); spin_lock(&root->fs_info->ordered_extent_lock); list_splice_init(&root->fs_info->ordered_extents, &splice); @@ -494,19 +506,32 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) spin_unlock(&root->fs_info->ordered_extent_lock); if (inode) { - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - if (delay_iput) - btrfs_add_delayed_iput(inode); - else - iput(inode); + ordered->flush_work.func = btrfs_run_ordered_extent_work; + list_add_tail(&ordered->work_list, &works); + btrfs_queue_worker(&root->fs_info->flush_workers, + &ordered->flush_work); } else { btrfs_put_ordered_extent(ordered); } + cond_resched(); spin_lock(&root->fs_info->ordered_extent_lock); } spin_unlock(&root->fs_info->ordered_extent_lock); + + list_for_each_entry_safe(ordered, next, &works, work_list) { + list_del_init(&ordered->work_list); + wait_for_completion(&ordered->completion); + + inode = ordered->inode; + btrfs_put_ordered_extent(ordered); + if (delay_iput) + btrfs_add_delayed_iput(inode); + else + iput(inode); + + cond_resched(); + } } /* diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index e8dcec635112..efc7c2930c17 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -128,8 +128,11 @@ struct btrfs_ordered_extent { struct list_head root_extent_list; struct btrfs_work work; -}; + struct completion completion; + struct btrfs_work flush_work; + struct list_head work_list; +}; /* * calculates the total size you need to allocate for an ordered sum -- cgit v1.2.1 From e2a29943e9a2ee2aa737a77f550f46ba72269db4 Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Tue, 14 Jun 2011 17:29:51 +0200 Subject: fsnotify: pass group to fsnotify_destroy_mark() In fsnotify_destroy_mark() dont get the group from the passed mark anymore, but pass the group itself as an additional parameter to the function. 
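Callers that already know the group simply pass it through; a caller that only holds the mark pins the group under the mark lock first, as the inode- and vfsmount-mark clearing paths below now do:

	struct fsnotify_group *group;

	spin_lock(&mark->lock);
	fsnotify_get_group(mark->group);	/* keep the group alive */
	group = mark->group;
	spin_unlock(&mark->lock);

	fsnotify_destroy_mark(mark, group);
	fsnotify_put_mark(mark);
	fsnotify_put_group(group);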
Signed-off-by: Lino Sanfilippo Signed-off-by: Eric Paris --- fs/notify/dnotify/dnotify.c | 4 ++-- fs/notify/fanotify/fanotify_user.c | 4 ++-- fs/notify/inode_mark.c | 10 +++++++++- fs/notify/inotify/inotify_fsnotify.c | 2 +- fs/notify/inotify/inotify_user.c | 2 +- fs/notify/mark.c | 21 ++++----------------- fs/notify/vfsmount_mark.c | 10 +++++++++- 7 files changed, 28 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 3344bdd5506e..08b886f119ce 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -201,7 +201,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id) /* nothing else could have found us thanks to the dnotify_mark_mutex */ if (dn_mark->dn == NULL) - fsnotify_destroy_mark(fsn_mark); + fsnotify_destroy_mark(fsn_mark, dnotify_group); mutex_unlock(&dnotify_mark_mutex); @@ -385,7 +385,7 @@ out: spin_unlock(&fsn_mark->lock); if (destroy) - fsnotify_destroy_mark(fsn_mark); + fsnotify_destroy_mark(fsn_mark, dnotify_group); mutex_unlock(&dnotify_mark_mutex); fsnotify_put_mark(fsn_mark); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 599a01952c74..1218d10424d0 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -546,7 +546,7 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, &destroy_mark); if (destroy_mark) - fsnotify_destroy_mark(fsn_mark); + fsnotify_destroy_mark(fsn_mark, group); fsnotify_put_mark(fsn_mark); if (removed & real_mount(mnt)->mnt_fsnotify_mask) @@ -570,7 +570,7 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group, removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, &destroy_mark); if (destroy_mark) - fsnotify_destroy_mark(fsn_mark); + fsnotify_destroy_mark(fsn_mark, group); /* matches the fsnotify_find_inode_mark() */ fsnotify_put_mark(fsn_mark); if (removed & inode->i_fsnotify_mask) diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 4e9071e37d5d..21230209c957 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -99,8 +99,16 @@ void fsnotify_clear_marks_by_inode(struct inode *inode) spin_unlock(&inode->i_lock); list_for_each_entry_safe(mark, lmark, &free_list, i.free_i_list) { - fsnotify_destroy_mark(mark); + struct fsnotify_group *group; + + spin_lock(&mark->lock); + fsnotify_get_group(mark->group); + group = mark->group; + spin_unlock(&mark->lock); + + fsnotify_destroy_mark(mark, group); fsnotify_put_mark(mark); + fsnotify_put_group(group); } } diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 74977fbf5aae..871569c7d609 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -132,7 +132,7 @@ static int inotify_handle_event(struct fsnotify_group *group, } if (inode_mark->mask & IN_ONESHOT) - fsnotify_destroy_mark(inode_mark); + fsnotify_destroy_mark(inode_mark, group); return ret; } diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 246250f1db7a..00ff82ff7c9f 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -816,7 +816,7 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) ret = 0; - fsnotify_destroy_mark(&i_mark->fsn_mark); + fsnotify_destroy_mark(&i_mark->fsn_mark, group); /* match ref taken by inotify_idr_find */ fsnotify_put_mark(&i_mark->fsn_mark); diff --git a/fs/notify/mark.c 
b/fs/notify/mark.c index ab25b810b146..b77c833c8d0a 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -121,21 +121,11 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) * The caller had better be holding a reference to this mark so we don't actually * do the final put under the mark->lock */ -void fsnotify_destroy_mark(struct fsnotify_mark *mark) +void fsnotify_destroy_mark(struct fsnotify_mark *mark, + struct fsnotify_group *group) { - struct fsnotify_group *group; struct inode *inode = NULL; - spin_lock(&mark->lock); - /* dont get the group from a mark that is not alive yet */ - if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { - spin_unlock(&mark->lock); - return; - } - fsnotify_get_group(mark->group); - group = mark->group; - spin_unlock(&mark->lock); - mutex_lock(&group->mark_mutex); spin_lock(&mark->lock); @@ -143,7 +133,7 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark) if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { spin_unlock(&mark->lock); mutex_unlock(&group->mark_mutex); - goto put_group; + return; } mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; @@ -194,9 +184,6 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark) */ atomic_dec(&group->num_marks); - -put_group: - fsnotify_put_group(group); } void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) @@ -307,7 +294,7 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, mutex_unlock(&group->mark_mutex); list_for_each_entry_safe(mark, lmark, &free_list, free_g_list) { - fsnotify_destroy_mark(mark); + fsnotify_destroy_mark(mark, group); fsnotify_put_mark(mark); } } diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index f26a348827f8..4df58b8ea64a 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -46,8 +46,16 @@ void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) spin_unlock(&mnt->mnt_root->d_lock); list_for_each_entry_safe(mark, lmark, &free_list, m.free_m_list) { - fsnotify_destroy_mark(mark); + struct fsnotify_group *group; + + spin_lock(&mark->lock); + fsnotify_get_group(mark->group); + group = mark->group; + spin_unlock(&mark->lock); + + fsnotify_destroy_mark(mark, group); fsnotify_put_mark(mark); + fsnotify_put_group(group); } } -- cgit v1.2.1 From d5a335b845792d2a69ed1e244c0b233117b7db3c Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Tue, 14 Jun 2011 17:29:52 +0200 Subject: fsnotify: introduce locked versions of fsnotify_add_mark() and fsnotify_remove_mark() This patch introduces fsnotify_add_mark_locked() and fsnotify_remove_mark_locked() which are essentially the same as fsnotify_add_mark() and fsnotify_remove_mark() but assume that the caller has already taken the groups mark mutex. 
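(The destroy-side helper actually introduced by the diff below is named fsnotify_destroy_mark_locked().) The existing entry points become thin wrappers that take and drop the mutex around the locked variants:

void fsnotify_destroy_mark(struct fsnotify_mark *mark,
			   struct fsnotify_group *group)
{
	mutex_lock(&group->mark_mutex);
	fsnotify_destroy_mark_locked(mark, group);
	mutex_unlock(&group->mark_mutex);
}

int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group,
		      struct inode *inode, struct vfsmount *mnt, int allow_dups)
{
	int ret;

	mutex_lock(&group->mark_mutex);
	ret = fsnotify_add_mark_locked(mark, group, inode, mnt, allow_dups);
	mutex_unlock(&group->mark_mutex);
	return ret;
}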
Signed-off-by: Lino Sanfilippo Signed-off-by: Eric Paris --- fs/notify/mark.c | 42 ++++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/notify/mark.c b/fs/notify/mark.c index b77c833c8d0a..f9dda0304a10 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -121,18 +121,18 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) * The caller had better be holding a reference to this mark so we don't actually * do the final put under the mark->lock */ -void fsnotify_destroy_mark(struct fsnotify_mark *mark, - struct fsnotify_group *group) +void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, + struct fsnotify_group *group) { struct inode *inode = NULL; - mutex_lock(&group->mark_mutex); + BUG_ON(!mutex_is_locked(&group->mark_mutex)); + spin_lock(&mark->lock); /* something else already called this function on this mark */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { spin_unlock(&mark->lock); - mutex_unlock(&group->mark_mutex); return; } @@ -149,6 +149,8 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark, list_del_init(&mark->g_list); spin_unlock(&mark->lock); + + /* release lock temporarily */ mutex_unlock(&group->mark_mutex); spin_lock(&destroy_lock); @@ -184,6 +186,16 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark, */ atomic_dec(&group->num_marks); + + mutex_lock(&group->mark_mutex); +} + +void fsnotify_destroy_mark(struct fsnotify_mark *mark, + struct fsnotify_group *group) +{ + mutex_lock(&group->mark_mutex); + fsnotify_destroy_mark_locked(mark, group); + mutex_unlock(&group->mark_mutex); } void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) @@ -208,14 +220,15 @@ void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mas * These marks may be used for the fsnotify backend to determine which * event types should be delivered to which group. */ -int fsnotify_add_mark(struct fsnotify_mark *mark, - struct fsnotify_group *group, struct inode *inode, - struct vfsmount *mnt, int allow_dups) +int fsnotify_add_mark_locked(struct fsnotify_mark *mark, + struct fsnotify_group *group, struct inode *inode, + struct vfsmount *mnt, int allow_dups) { int ret = 0; BUG_ON(inode && mnt); BUG_ON(!inode && !mnt); + BUG_ON(!mutex_is_locked(&group->mark_mutex)); /* * LOCKING ORDER!!!! 
@@ -223,8 +236,6 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, * mark->lock * inode->i_lock */ - mutex_lock(&group->mark_mutex); - spin_lock(&mark->lock); mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE; @@ -250,8 +261,6 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_set_mark_mask_locked(mark, mark->mask); spin_unlock(&mark->lock); - mutex_unlock(&group->mark_mutex); - if (inode) __fsnotify_update_child_dentry_flags(inode); @@ -264,7 +273,6 @@ err: atomic_dec(&group->num_marks); spin_unlock(&mark->lock); - mutex_unlock(&group->mark_mutex); spin_lock(&destroy_lock); list_add(&mark->destroy_list, &destroy_list); @@ -274,6 +282,16 @@ err: return ret; } +int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, + struct inode *inode, struct vfsmount *mnt, int allow_dups) +{ + int ret; + mutex_lock(&group->mark_mutex); + ret = fsnotify_add_mark_locked(mark, group, inode, mnt, allow_dups); + mutex_unlock(&group->mark_mutex); + return ret; +} + /* * clear any marks in a group in which mark->flags & flags is true */ -- cgit v1.2.1 From 64c20d2a20fce295c260ea6cb3b468edfa2fb07b Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Tue, 14 Jun 2011 17:29:53 +0200 Subject: fsnotify: dont put marks on temporary list when clearing marks by group In clear_marks_by_group_flags() the mark list of a group is iterated and the marks are put on a temporary list. Since we introduced fsnotify_destroy_mark_locked() we dont need the temp list any more and are able to remove the marks while the mark list is iterated and the mark list mutex is held. Signed-off-by: Lino Sanfilippo Signed-off-by: Eric Paris --- fs/notify/mark.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/notify/mark.c b/fs/notify/mark.c index f9dda0304a10..0e93d90bb753 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -299,22 +299,16 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags) { struct fsnotify_mark *lmark, *mark; - LIST_HEAD(free_list); mutex_lock(&group->mark_mutex); list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { if (mark->flags & flags) { - list_add(&mark->free_g_list, &free_list); - list_del_init(&mark->g_list); fsnotify_get_mark(mark); + fsnotify_destroy_mark_locked(mark, group); + fsnotify_put_mark(mark); } } mutex_unlock(&group->mark_mutex); - - list_for_each_entry_safe(mark, lmark, &free_list, free_g_list) { - fsnotify_destroy_mark(mark, group); - fsnotify_put_mark(mark); - } } /* -- cgit v1.2.1 From 6960b0d909cde5bdff49e4e5c1250edd10be7ebd Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Fri, 12 Aug 2011 01:13:31 +0200 Subject: fsnotify: change locking order On Mon, Aug 01, 2011 at 04:38:22PM -0400, Eric Paris wrote: > > I finally built and tested a v3.0 kernel with these patches (I know I'm > SOOOOOO far behind). Not what I hoped for: > > > [ 150.937798] VFS: Busy inodes after unmount of tmpfs. Self-destruct in 5 seconds. Have a nice day... 
> > [ 150.945290] BUG: unable to handle kernel NULL pointer dereference at 0000000000000070 > > [ 150.946012] IP: [] shmem_free_inode+0x18/0x50 > > [ 150.946012] PGD 2bf9e067 PUD 2bf9f067 PMD 0 > > [ 150.946012] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC > > [ 150.946012] CPU 0 > > [ 150.946012] Modules linked in: nfs lockd fscache auth_rpcgss nfs_acl sunrpc ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 ip6table_filter ip6_tables ext4 jbd2 crc16 joydev ata_piix i2c_piix4 pcspkr uinput ipv6 autofs4 usbhid [last unloaded: scsi_wait_scan] > > [ 150.946012] > > [ 150.946012] Pid: 2764, comm: syscall_thrash Not tainted 3.0.0+ #1 Red Hat KVM > > [ 150.946012] RIP: 0010:[] [] shmem_free_inode+0x18/0x50 > > [ 150.946012] RSP: 0018:ffff88002c2e5df8 EFLAGS: 00010282 > > [ 150.946012] RAX: 000000004e370d9f RBX: 0000000000000000 RCX: ffff88003a029438 > > [ 150.946012] RDX: 0000000033630a5f RSI: 0000000000000000 RDI: ffff88003491c240 > > [ 150.946012] RBP: ffff88002c2e5e08 R08: 0000000000000000 R09: 0000000000000000 > > [ 150.946012] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88003a029428 > > [ 150.946012] R13: ffff88003a029428 R14: ffff88003a029428 R15: ffff88003499a610 > > [ 150.946012] FS: 00007f5a05420700(0000) GS:ffff88003f600000(0000) knlGS:0000000000000000 > > [ 150.946012] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b > > [ 150.946012] CR2: 0000000000000070 CR3: 000000002a662000 CR4: 00000000000006f0 > > [ 150.946012] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > > [ 150.946012] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 > > [ 150.946012] Process syscall_thrash (pid: 2764, threadinfo ffff88002c2e4000, task ffff88002bfbc760) > > [ 150.946012] Stack: > > [ 150.946012] ffff88003a029438 ffff88003a029428 ffff88002c2e5e38 ffffffff81102f76 > > [ 150.946012] ffff88003a029438 ffff88003a029598 ffffffff8160f9c0 ffff88002c221250 > > [ 150.946012] ffff88002c2e5e68 ffffffff8115e9be ffff88002c2e5e68 ffff88003a029438 > > [ 150.946012] Call Trace: > > [ 150.946012] [] shmem_evict_inode+0x76/0x130 > > [ 150.946012] [] evict+0x7e/0x170 > > [ 150.946012] [] iput_final+0xd0/0x190 > > [ 150.946012] [] iput+0x33/0x40 > > [ 150.946012] [] fsnotify_destroy_mark_locked+0x145/0x160 > > [ 150.946012] [] fsnotify_destroy_mark+0x36/0x50 > > [ 150.946012] [] sys_inotify_rm_watch+0x77/0xd0 > > [ 150.946012] [] system_call_fastpath+0x16/0x1b > > [ 150.946012] Code: 67 4a 00 b8 e4 ff ff ff eb aa 66 0f 1f 84 00 00 00 00 00 55 48 89 e5 48 83 ec 10 48 89 1c 24 4c 89 64 24 08 48 8b 9f 40 05 00 00 > > [ 150.946012] 83 7b 70 00 74 1c 4c 8d a3 80 00 00 00 4c 89 e7 e8 d2 5d 4a > > [ 150.946012] RIP [] shmem_free_inode+0x18/0x50 > > [ 150.946012] RSP > > [ 150.946012] CR2: 0000000000000070 > > Looks at aweful lot like the problem from: > http://www.spinics.net/lists/linux-fsdevel/msg46101.html > I tried to reproduce this bug with your test program, but without success. However, if I understand correctly, this occurs since we dont hold any locks when we call iput() in mark_destroy(), right? With the patches you tested, iput() is also not called within any lock, since the groups mark_mutex is released temporarily before iput() is called. This is, since the original codes behaviour is similar. However since we now have a mutex as the biggest lock, we can do what you suggested (http://www.spinics.net/lists/linux-fsdevel/msg46107.html) and call iput() with the mutex held to avoid the race. The patch below implements this. 
It uses nested locking to avoid deadlock in case we do the final iput() on an inode which still holds marks and thus would take the mutex again when calling fsnotify_inode_delete() in destroy_inode(). Signed-off-by: Lino Sanfilippo Signed-off-by: Eric Paris --- fs/notify/mark.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 0e93d90bb753..fc6b49bf7360 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -150,6 +150,8 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, spin_unlock(&mark->lock); + if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) + iput(inode); /* release lock temporarily */ mutex_unlock(&group->mark_mutex); @@ -157,6 +159,11 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, list_add(&mark->destroy_list, &destroy_list); spin_unlock(&destroy_lock); wake_up(&destroy_waitq); + /* + * We don't necessarily have a ref on mark from caller so the above destroy + * may have actually freed it, unless this group provides a 'freeing_mark' + * function which must be holding a reference. + */ /* * Some groups like to know that marks are being freed. This is a @@ -178,22 +185,15 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, * is just a lazy update (and could be a perf win...) */ - if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) - iput(inode); - /* - * We don't necessarily have a ref on mark from caller so the above iput - * may have already destroyed it. Don't touch from now on. - */ - atomic_dec(&group->num_marks); - mutex_lock(&group->mark_mutex); + mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); } void fsnotify_destroy_mark(struct fsnotify_mark *mark, struct fsnotify_group *group) { - mutex_lock(&group->mark_mutex); + mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); fsnotify_destroy_mark_locked(mark, group); mutex_unlock(&group->mark_mutex); } @@ -300,7 +300,7 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, { struct fsnotify_mark *lmark, *mark; - mutex_lock(&group->mark_mutex); + mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { if (mark->flags & flags) { fsnotify_get_mark(mark); -- cgit v1.2.1 From 0a6b6bd5919a65030b557ec8fe81f6fb3e93744a Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 14 Oct 2011 17:43:39 -0400 Subject: fsnotify: make fasync generic for both inotify and fanotify inotify is supposed to support async signal notification when information is available on the inotify fd. This patch moves that support to generic fsnotify functions so it can be used by all notification mechanisms. 
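The generic pieces are small, condensed here from the diff below: a shared fasync handler keyed off the group, a SIGIO kick when an event is queued, and a release-time teardown for files that had FASYNC set:

int fsnotify_fasync(int fd, struct file *file, int on)
{
	struct fsnotify_group *group = file->private_data;

	return fasync_helper(fd, file, on, &group->fsn_fa) >= 0 ? 0 : -EIO;
}

	/* on event delivery */
	wake_up(&group->notification_waitq);
	kill_fasync(&group->fsn_fa, SIGIO, POLL_IN);

	/* in each backend's ->release() */
	if (file->f_flags & FASYNC)
		fsnotify_fasync(-1, file, 0);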
Signed-off-by: Eric Paris --- fs/notify/fanotify/fanotify_user.c | 4 ++++ fs/notify/group.c | 7 +++++++ fs/notify/inotify/inotify_user.c | 13 ++++--------- fs/notify/notification.c | 1 + 4 files changed, 16 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 1218d10424d0..f0e7a57bc899 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -414,6 +414,10 @@ static int fanotify_release(struct inode *ignored, struct file *file) wake_up(&group->fanotify_data.access_waitq); #endif + + if (file->f_flags & FASYNC) + fsnotify_fasync(-1, file, 0); + /* matches the fanotify_init->fsnotify_alloc_group */ fsnotify_destroy_group(group); diff --git a/fs/notify/group.c b/fs/notify/group.c index 1f7305711fc9..bd2625bd88b4 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -102,3 +102,10 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) return group; } + +int fsnotify_fasync(int fd, struct file *file, int on) +{ + struct fsnotify_group *group = file->private_data; + + return fasync_helper(fd, file, on, &group->fsn_fa) >= 0 ? 0 : -EIO; +} diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 00ff82ff7c9f..68f7bec1e664 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -280,19 +280,15 @@ static ssize_t inotify_read(struct file *file, char __user *buf, return ret; } -static int inotify_fasync(int fd, struct file *file, int on) -{ - struct fsnotify_group *group = file->private_data; - - return fasync_helper(fd, file, on, &group->inotify_data.fa) >= 0 ? 0 : -EIO; -} - static int inotify_release(struct inode *ignored, struct file *file) { struct fsnotify_group *group = file->private_data; pr_debug("%s: group=%p\n", __func__, group); + if (file->f_flags & FASYNC) + fsnotify_fasync(-1, file, 0); + /* free this group, matching get was inotify_init->fsnotify_obtain_group */ fsnotify_destroy_group(group); @@ -335,7 +331,7 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, static const struct file_operations inotify_fops = { .poll = inotify_poll, .read = inotify_read, - .fasync = inotify_fasync, + .fasync = fsnotify_fasync, .release = inotify_release, .unlocked_ioctl = inotify_ioctl, .compat_ioctl = inotify_ioctl, @@ -706,7 +702,6 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events) spin_lock_init(&group->inotify_data.idr_lock); idr_init(&group->inotify_data.idr); group->inotify_data.last_wd = 0; - group->inotify_data.fa = NULL; group->inotify_data.user = get_current_user(); if (atomic_inc_return(&group->inotify_data.user->inotify_devs) > diff --git a/fs/notify/notification.c b/fs/notify/notification.c index c887b1378f7e..b3963d8c9988 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -225,6 +225,7 @@ alloc_holder: mutex_unlock(&group->notification_mutex); wake_up(&group->notification_waitq); + kill_fasync(&group->fsn_fa, SIGIO, POLL_IN); return return_event; } -- cgit v1.2.1 From 03a1cec1f17ac1a6041996b3e40f96b5a2f90e1b Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Fri, 23 Mar 2012 02:42:23 +0100 Subject: fanotify: dont merge permission events Boyd Yang reported a problem for the case that multiple threads of the same thread group are waiting for a reponse for a permission event. 
In this case it is possible that some of the threads are never woken up, even if the response for the event has been received (see http://marc.info/?l=linux-kernel&m=131822913806350&w=2). The reason is that we are currently merging permission events if they belong to the same thread group. But we are not prepared to wake up more than one waiter for each event. We do wait_event(group->fanotify_data.access_waitq, event->response || atomic_read(&group->fanotify_data.bypass_perm)); and after that event->response = 0; which is the reason that even if we woke up all waiters for the same event some of them may see event->response being already set 0 again, then go back to sleep and block forever. With this patch we avoid that more than one thread is waiting for a response by not merging permission events for the same thread group any more. Reported-by: Boyd Yang Signed-off-by: Lino Sanfilippo Signed-off-by: Eric Paris --- fs/notify/fanotify/fanotify.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index f35794b97e8e..aeb5b5abbd4f 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -18,6 +18,12 @@ static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new) old->tgid == new->tgid) { switch (old->data_type) { case (FSNOTIFY_EVENT_PATH): +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + /* dont merge two permission events */ + if ((old->mask & FAN_ALL_PERM_EVENTS) && + (new->mask & FAN_ALL_PERM_EVENTS)) + return false; +#endif if ((old->path.mnt == new->path.mnt) && (old->path.dentry == new->path.dentry)) return true; -- cgit v1.2.1 From 8b99c3ccf735a2294c7842d236caa42e543e2c95 Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Sat, 24 Mar 2012 23:44:19 +0100 Subject: inotify: dont skip removal of watch descriptor if creation of ignored event failed In inotify_ignored_and_remove_idr() the removal of a watch descriptor is skipped if the allocation of an ignored event failed and we are leaking memory (the watch descriptor and the mark linked to it). This patch ensures that the watch descriptor is removed regardless of whether event creation failed or not. 
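The resulting flow, abridged from the diff below: the mark is resolved up front, an allocation failure only skips sending the ignored event, and the idr removal always runs:

	i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);

	ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL,
					      FSNOTIFY_EVENT_NONE, NULL, 0,
					      GFP_NOFS);
	if (!ignored_event)
		goto skip_send_ignore;	/* skip the event, not the cleanup */
	...
skip_send_ignore:
	if (ignored_event)
		fsnotify_put_event(ignored_event);

	/* remove this mark from the idr */
	inotify_remove_from_idr(group, i_mark);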
Signed-off-by: Lino Sanfilippo Signed-off-by: Eric Paris --- fs/notify/inotify/inotify_user.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 68f7bec1e664..a6879d169241 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -513,13 +513,13 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, struct fsnotify_event_private_data *fsn_event_priv; int ret; + i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); + ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, NULL, 0, GFP_NOFS); if (!ignored_event) - return; - - i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); + goto skip_send_ignore; event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS); if (unlikely(!event_priv)) @@ -541,9 +541,9 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, } skip_send_ignore: - /* matches the reference taken when the event was created */ - fsnotify_put_event(ignored_event); + if (ignored_event) + fsnotify_put_event(ignored_event); /* remove this mark from the idr */ inotify_remove_from_idr(group, i_mark); -- cgit v1.2.1 From 1ca39ab9d21ac93f94b9e3eb364ea9a5cf2aba06 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 26 Mar 2012 13:07:59 -0400 Subject: inotify: automatically restart syscalls We were mistakenly returning EINTR when we found an outstanding signal. Instead we should return ERESTARTSYS and allow the kernel to handle things the right way. Patch-from: Oleg Nesterov Signed-off-by: Eric Paris --- fs/notify/inotify/inotify_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index a6879d169241..463e828f1f31 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -264,7 +264,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf, ret = -EAGAIN; if (file->f_flags & O_NONBLOCK) break; - ret = -EINTR; + ret = -ERESTARTSYS; if (signal_pending(current)) break; -- cgit v1.2.1 From d0e1d66b5aa1ec9f556f951aa9a114cc192cd01c Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Tue, 11 Dec 2012 16:00:21 -0800 Subject: writeback: remove nr_pages_dirtied arg from balance_dirty_pages_ratelimited_nr() There is no reason to pass the nr_pages_dirtied argument, because the nr_pages_dirtied value from the caller is unused in balance_dirty_pages_ratelimited_nr().
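Call sites shrink accordingly; for example, in __btrfs_buffered_write (condensed):

-	balance_dirty_pages_ratelimited_nr(inode->i_mapping, dirty_pages);
+	balance_dirty_pages_ratelimited(inode->i_mapping);

Callers such as ocfs2 and splice can also drop the nr_pages computation they performed solely to feed the old argument.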
Signed-off-by: Namjae Jeon Signed-off-by: Vivek Trivedi Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/disk-io.c | 8 ++++---- fs/btrfs/file.c | 3 +-- fs/btrfs/ioctl.c | 2 +- fs/ocfs2/file.c | 5 +---- fs/splice.c | 5 +---- 5 files changed, 8 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7cda51995c1e..22a0439e5a86 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3416,8 +3416,8 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) num_dirty = root->fs_info->dirty_metadata_bytes; if (num_dirty > thresh) { - balance_dirty_pages_ratelimited_nr( - root->fs_info->btree_inode->i_mapping, 1); + balance_dirty_pages_ratelimited( + root->fs_info->btree_inode->i_mapping); } return; } @@ -3437,8 +3437,8 @@ void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) num_dirty = root->fs_info->dirty_metadata_bytes; if (num_dirty > thresh) { - balance_dirty_pages_ratelimited_nr( - root->fs_info->btree_inode->i_mapping, 1); + balance_dirty_pages_ratelimited( + root->fs_info->btree_inode->i_mapping); } return; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9ab1bed88116..a8ee75cb96ee 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1346,8 +1346,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, cond_resched(); - balance_dirty_pages_ratelimited_nr(inode->i_mapping, - dirty_pages); + balance_dirty_pages_ratelimited(inode->i_mapping); if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) btrfs_btree_balance_dirty(root, 1); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8fcf9a59c28d..5b3429ab8ec1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1225,7 +1225,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, } defrag_count += ret; - balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); + balance_dirty_pages_ratelimited(inode->i_mapping); mutex_unlock(&inode->i_mutex); if (newer_than) { diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 5a4ee77cec51..dda089804942 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2513,18 +2513,15 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, ret = sd.num_spliced; if (ret > 0) { - unsigned long nr_pages; int err; - nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - err = generic_write_sync(out, *ppos, ret); if (err) ret = err; else *ppos += ret; - balance_dirty_pages_ratelimited_nr(mapping, nr_pages); + balance_dirty_pages_ratelimited(mapping); } return ret; diff --git a/fs/splice.c b/fs/splice.c index 13e5b4776e7a..8890604e3fcd 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1024,17 +1024,14 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, ret = sd.num_spliced; if (ret > 0) { - unsigned long nr_pages; int err; - nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - err = generic_write_sync(out, *ppos, ret); if (err) ret = err; else *ppos += ret; - balance_dirty_pages_ratelimited_nr(mapping, nr_pages); + balance_dirty_pages_ratelimited(mapping); } sb_end_write(inode->i_sb); -- cgit v1.2.1 From 42d7395feb56f0655cd8b68e06fc6063823449f8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 11 Dec 2012 16:01:34 -0800 Subject: mm: support more pagesizes for MAP_HUGETLB/SHM_HUGETLB There was some desire in large applications using MAP_HUGETLB or SHM_HUGETLB to use 1GB huge pages on some mappings, and stay with 2MB on others. 
This is useful together with NUMA policy: use 2MB interleaving on some mappings, but 1GB on local mappings. This patch extends the IPC/SHM syscall interfaces slightly to allow specifying the page size. It borrows some upper bits in the existing flag arguments and allows encoding the log of the desired page size in addition to the *_HUGETLB flag. When 0 is specified the default size is used, this makes the change fully compatible. Extending the internal hugetlb code to handle this is straight forward. Instead of a single mount it just keeps an array of them and selects the right mount based on the specified page size. When no page size is specified it uses the mount of the default page size. The change is not visible in /proc/mounts because internal mounts don't appear there. It also has very little overhead: the additional mounts just consume a super block, but not more memory when not used. I also exported the new flags to the user headers (they were previously under __KERNEL__). Right now only symbols for x86 and some other architecture for 1GB and 2MB are defined. The interface should already work for all other architectures though. Only architectures that define multiple hugetlb sizes actually need it (that is currently x86, tile, powerpc). However tile and powerpc have user configurable hugetlb sizes, so it's not easy to add defines. A program on those architectures would need to query sysfs and use the appropiate log2. [akpm@linux-foundation.org: cleanups] [rientjes@google.com: fix build] [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Andi Kleen Cc: Michael Kerrisk Acked-by: Rik van Riel Acked-by: KAMEZAWA Hiroyuki Cc: Hillf Danton Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 63 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c5bc355d8243..21b8a4875237 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -923,7 +923,7 @@ static struct file_system_type hugetlbfs_fs_type = { .kill_sb = kill_litter_super, }; -static struct vfsmount *hugetlbfs_vfsmount; +static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; static int can_do_hugetlb_shm(void) { @@ -932,9 +932,22 @@ static int can_do_hugetlb_shm(void) return capable(CAP_IPC_LOCK) || in_group_p(shm_group); } +static int get_hstate_idx(int page_size_log) +{ + struct hstate *h; + + if (!page_size_log) + return default_hstate_idx; + h = size_to_hstate(1 << page_size_log); + if (!h) + return -1; + return h - hstates; +} + struct file *hugetlb_file_setup(const char *name, unsigned long addr, size_t size, vm_flags_t acctflag, - struct user_struct **user, int creat_flags) + struct user_struct **user, + int creat_flags, int page_size_log) { int error = -ENOMEM; struct file *file; @@ -944,9 +957,14 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr, struct qstr quick_string; struct hstate *hstate; unsigned long num_pages; + int hstate_idx; + + hstate_idx = get_hstate_idx(page_size_log); + if (hstate_idx < 0) + return ERR_PTR(-ENODEV); *user = NULL; - if (!hugetlbfs_vfsmount) + if (!hugetlbfs_vfsmount[hstate_idx]) return ERR_PTR(-ENOENT); if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { @@ -963,7 +981,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr, } } - root = hugetlbfs_vfsmount->mnt_root; + root = hugetlbfs_vfsmount[hstate_idx]->mnt_root; 
quick_string.name = name; quick_string.len = strlen(quick_string.name); quick_string.hash = 0; @@ -971,7 +989,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr, if (!path.dentry) goto out_shm_unlock; - path.mnt = mntget(hugetlbfs_vfsmount); + path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]); error = -ENOSPC; inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0); if (!inode) @@ -1011,8 +1029,9 @@ out_shm_unlock: static int __init init_hugetlbfs_fs(void) { + struct hstate *h; int error; - struct vfsmount *vfsmount; + int i; error = bdi_init(&hugetlbfs_backing_dev_info); if (error) @@ -1029,14 +1048,26 @@ static int __init init_hugetlbfs_fs(void) if (error) goto out; - vfsmount = kern_mount(&hugetlbfs_fs_type); + i = 0; + for_each_hstate(h) { + char buf[50]; + unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10); - if (!IS_ERR(vfsmount)) { - hugetlbfs_vfsmount = vfsmount; - return 0; - } + snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb); + hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type, + buf); - error = PTR_ERR(vfsmount); + if (IS_ERR(hugetlbfs_vfsmount[i])) { + pr_err("hugetlb: Cannot mount internal hugetlbfs for " + "page size %uK", ps_kb); + error = PTR_ERR(hugetlbfs_vfsmount[i]); + hugetlbfs_vfsmount[i] = NULL; + } + i++; + } + /* Non default hstates are optional */ + if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx])) + return 0; out: kmem_cache_destroy(hugetlbfs_inode_cachep); @@ -1047,13 +1078,19 @@ static int __init init_hugetlbfs_fs(void) static void __exit exit_hugetlbfs_fs(void) { + struct hstate *h; + int i; + + /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(hugetlbfs_inode_cachep); - kern_unmount(hugetlbfs_vfsmount); + i = 0; + for_each_hstate(h) + kern_unmount(hugetlbfs_vfsmount[i++]); unregister_filesystem(&hugetlbfs_fs_type); bdi_destroy(&hugetlbfs_backing_dev_info); } -- cgit v1.2.1 From 0865935598bb112a02f40017e8aaa6bce8577f23 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 11 Dec 2012 16:02:00 -0800 Subject: mm: use vm_unmapped_area() in hugetlbfs Update the hugetlb_get_unmapped_area function to make use of vm_unmapped_area() instead of implementing a brute force search. Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Cc: Hugh Dickins Cc: Russell King Cc: Ralf Baechle Cc: Paul Mundt Cc: "David S. Miller" Cc: Chris Metcalf Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 42 ++++++++---------------------------------- 1 file changed, 8 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 21b8a4875237..47e6e2f21e21 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -151,8 +151,8 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - unsigned long start_addr; struct hstate *h = hstate_file(file); + struct vm_unmapped_area_info info; if (len & ~huge_page_mask(h)) return -EINVAL; @@ -173,39 +173,13 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, return addr; } - if (len > mm->cached_hole_size) - start_addr = mm->free_area_cache; - else { - start_addr = TASK_UNMAPPED_BASE; - mm->cached_hole_size = 0; - } - -full_search: - addr = ALIGN(start_addr, huge_page_size(h)); - - for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { - /* At this point: (!vma || addr < vma->vm_end). */ - if (TASK_SIZE - len < addr) { - /* - * Start a new search - just in case we missed - * some holes. - */ - if (start_addr != TASK_UNMAPPED_BASE) { - start_addr = TASK_UNMAPPED_BASE; - mm->cached_hole_size = 0; - goto full_search; - } - return -ENOMEM; - } - - if (!vma || addr + len <= vma->vm_start) { - mm->free_area_cache = addr + len; - return addr; - } - if (addr + mm->cached_hole_size < vma->vm_start) - mm->cached_hole_size = vma->vm_start - addr; - addr = ALIGN(vma->vm_end, huge_page_size(h)); - } + info.flags = 0; + info.length = len; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + return vm_unmapped_area(&info); } #endif -- cgit v1.2.1 From 78bd52097d04205a33a8014a1b8ac01cf1ae9d06 Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Tue, 11 Dec 2012 16:02:31 -0800 Subject: mm: adjust address_space_operations.migratepage() return code Memory fragmentation introduced by ballooning might significantly reduce the number of 2MB contiguous memory blocks that can be used within a guest, thus imposing performance penalties associated with the reduced number of transparent huge pages that could be used by the guest workload. This patch-set follows the main idea discussed at the 2012 LSFMMS session: "Ballooning for transparent huge pages" -- http://lwn.net/Articles/490114/ to introduce the required changes to the virtio_balloon driver, as well as the changes to the core compaction & migration bits, in order to make those subsystems aware of ballooned pages and allow memory balloon pages to become movable within a guest, thus avoiding the aforementioned fragmentation issue. The following numbers demonstrate the benefit of this patch in allowing compaction to be more effective on memory-ballooned guests.
Results for STRESS-HIGHALLOC benchmark, from Mel Gorman's mmtests suite, running on a 4gB RAM KVM guest which was ballooning 512mB RAM in 64mB chunks, at every minute (inflating/deflating), while the test was running:

===BEGIN stress-highalloc
STRESS-HIGHALLOC
                 highalloc-3.7  highalloc-3.7
                     rc4-clean      rc4-patch
Pass 1          55.00 ( 0.00%) 62.00 ( 7.00%)
Pass 2          54.00 ( 0.00%) 62.00 ( 8.00%)
while Rested    75.00 ( 0.00%) 80.00 ( 5.00%)

MMTests Statistics: duration
                 3.7        3.7
           rc4-clean  rc4-patch
User         1207.59    1207.46
System       1300.55    1299.61
Elapsed      2273.72    2157.06

MMTests Statistics: vmstat
                              3.7        3.7
                        rc4-clean  rc4-patch
Page Ins                  3581516    2374368
Page Outs                11148692   10410332
Swap Ins                       80         47
Swap Outs                    3641        476
Direct pages scanned        37978      33826
Kswapd pages scanned      1828245    1342869
Kswapd pages reclaimed    1710236    1304099
Direct pages reclaimed      32207      31005
Kswapd efficiency             93%        97%
Kswapd velocity           804.077    622.546
Direct efficiency             84%        91%
Direct velocity            16.703     15.682
Percentage direct scans        2%         2%
Page writes by reclaim      79252       9704
Page writes file            75611       9228
Page writes anon             3641        476
Page reclaim immediate      16764      11014
Page rescued immediate          0          0
Slabs scanned             2171904    2152448
Direct inode steals           385       2261
Kswapd inode steals        659137     609670
Kswapd skipped wait             1         69
THP fault alloc               546        631
THP collapse alloc            361        339
THP splits                    259        263
THP fault fallback             98         50
THP collapse fail              20         17
Compaction stalls             747        499
Compaction success            244        145
Compaction failures           503        354
Compaction pages moved     370888     474837
Compaction move failure     77378      65259
===END stress-highalloc

This patch: Introduce MIGRATEPAGE_SUCCESS as the default return code for the address_space_operations.migratepage() method and document the expected return code for the same method in failure cases. Signed-off-by: Rafael Aquini Cc: Rusty Russell Cc: "Michael S.
Tsirkin" Cc: Rik van Riel Cc: Mel Gorman Cc: Andi Kleen Cc: Konrad Rzeszutek Wilk Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 12 ++++++------ fs/gfs2/glock.c | 2 +- fs/inode.c | 2 +- fs/nilfs2/page.c | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index ec0aca8ba6bf..6e9ed48064fc 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -555,7 +555,7 @@ void emergency_thaw_all(void) */ int sync_mapping_buffers(struct address_space *mapping) { - struct address_space *buffer_mapping = mapping->assoc_mapping; + struct address_space *buffer_mapping = mapping->private_data; if (buffer_mapping == NULL || list_empty(&mapping->private_list)) return 0; @@ -588,10 +588,10 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) struct address_space *buffer_mapping = bh->b_page->mapping; mark_buffer_dirty(bh); - if (!mapping->assoc_mapping) { - mapping->assoc_mapping = buffer_mapping; + if (!mapping->private_data) { + mapping->private_data = buffer_mapping; } else { - BUG_ON(mapping->assoc_mapping != buffer_mapping); + BUG_ON(mapping->private_data != buffer_mapping); } if (!bh->b_assoc_map) { spin_lock(&buffer_mapping->private_lock); @@ -788,7 +788,7 @@ void invalidate_inode_buffers(struct inode *inode) if (inode_has_buffers(inode)) { struct address_space *mapping = &inode->i_data; struct list_head *list = &mapping->private_list; - struct address_space *buffer_mapping = mapping->assoc_mapping; + struct address_space *buffer_mapping = mapping->private_data; spin_lock(&buffer_mapping->private_lock); while (!list_empty(list)) @@ -811,7 +811,7 @@ int remove_inode_buffers(struct inode *inode) if (inode_has_buffers(inode)) { struct address_space *mapping = &inode->i_data; struct list_head *list = &mapping->private_list; - struct address_space *buffer_mapping = mapping->assoc_mapping; + struct address_space *buffer_mapping = mapping->private_data; spin_lock(&buffer_mapping->private_lock); while (!list_empty(list)) { diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index e6c2fd53cab2..0f22d09f358d 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -768,7 +768,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, mapping->host = s->s_bdev->bd_inode; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); - mapping->assoc_mapping = NULL; + mapping->private_data = NULL; mapping->backing_dev_info = s->s_bdi; mapping->writeback_index = 0; } diff --git a/fs/inode.c b/fs/inode.c index 64999f144153..14084b72b259 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -165,7 +165,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mapping->host = inode; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); - mapping->assoc_mapping = NULL; + mapping->private_data = NULL; mapping->backing_dev_info = &default_backing_dev_info; mapping->writeback_index = 0; diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 3e7b2a0dc0c8..07f76db04ec7 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -431,7 +431,7 @@ void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, mapping->host = inode; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); - mapping->assoc_mapping = NULL; + mapping->private_data = NULL; mapping->backing_dev_info = bdi; mapping->a_ops = &empty_aops; } -- cgit v1.2.1 From a9c58b907dbc6821533dfc295b63caf111ff1f16 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 11 Dec 2012 16:02:54 -0800 Subject: mm, oom: change type 
of oom_score_adj to short The maximum oom_score_adj is 1000 and the minimum oom_score_adj is -1000, so this range can be represented by the signed short type with no functional change. The extra space this frees up in struct signal_struct will be used for per-thread oom kill flags in the next patch. Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Reviewed-by: Michal Hocko Cc: Anton Vorontsov Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 9e28356a959a..aa63d25157b8 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -985,7 +985,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf, { struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); char buffer[PROC_NUMBUF]; - int oom_score_adj = OOM_SCORE_ADJ_MIN; + short oom_score_adj = OOM_SCORE_ADJ_MIN; unsigned long flags; size_t len; @@ -996,7 +996,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf, unlock_task_sighand(task, &flags); } put_task_struct(task); - len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj); + len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj); return simple_read_from_buffer(buf, count, ppos, buffer, len); } @@ -1043,15 +1043,15 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, goto err_task_lock; } - if (oom_score_adj < task->signal->oom_score_adj_min && + if ((short)oom_score_adj < task->signal->oom_score_adj_min && !capable(CAP_SYS_RESOURCE)) { err = -EACCES; goto err_sighand; } - task->signal->oom_score_adj = oom_score_adj; + task->signal->oom_score_adj = (short)oom_score_adj; if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) - task->signal->oom_score_adj_min = oom_score_adj; + task->signal->oom_score_adj_min = (short)oom_score_adj; trace_oom_score_adj_update(task); err_sighand: -- cgit v1.2.1 From 67fad106a219e083c91c79695bd1807dde1bf7b9 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 12 Dec 2012 11:38:44 -0500 Subject: nfs: don't zero out the rest of the page if we hit the EOF on a DIO READ Eryu provided a test program that would segfault when attempting to read past the EOF on file that was opened O_DIRECT. The buffer given to the read() call was on the stack, and when he attempted to read past it it would scribble over the rest of the stack page. If we hit the end of the file on a DIO READ request, then we don't want to zero out the rest of the buffer. These aren't pagecache pages after all, and there's no guarantee that the buffers that were passed in represent entire pages. 
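Eryu's test program is not reproduced here; a minimal sketch of the failing pattern (hypothetical file name, assumed to live on an NFS mount and to be shorter than the request) would look something like:

#define _GNU_SOURCE	/* for O_DIRECT */
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	char buf[1024];		/* on the stack, as in the report */
	int fd = open("testfile", O_RDONLY | O_DIRECT);

	if (fd < 0)
		return 1;
	/* pre-fix, a read that ran past EOF made the kernel zero out to
	 * the end of the page backing buf, scribbling over the rest of
	 * the stack page */
	read(fd, buf, sizeof(buf));
	close(fd);
	return 0;
}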
Cc: # v3.5+ Cc: Fred Isaman Reported-by: Eryu Guan Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index cae26cbd59ee..594f4e7e0b9a 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -266,14 +266,6 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) struct nfs_page *req = nfs_list_entry(hdr->pages.next); struct page *page = req->wb_page; - if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) { - if (bytes > hdr->good_bytes) - zero_user(page, 0, PAGE_SIZE); - else if (hdr->good_bytes - bytes < PAGE_SIZE) - zero_user_segment(page, - hdr->good_bytes & ~PAGE_MASK, - PAGE_SIZE); - } if (!PageCompound(page)) { if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { if (bytes < hdr->good_bytes) -- cgit v1.2.1 From be7e985804c610fcdcee8730cf42718b8a4e1c41 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 12 Dec 2012 12:36:31 -0500 Subject: nfs: fix page dirtying in NFS DIO read codepath The NFS DIO code will dirty pages that catch read responses in order to handle the case where someone is doing DIO reads into an mmapped buffer. The existing code doesn't really do the right thing though since it doesn't take into account the case where we might be attempting to read past the EOF. Fix the logic in that code to only dirty pages that ended up receiving data from the read. Note too that it really doesn't matter if NFS_IOHDR_ERROR is set or not. All that matters is if the page was altered by the read. Cc: Fred Isaman Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 594f4e7e0b9a..0bd7a55a5f07 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -266,13 +266,8 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) struct nfs_page *req = nfs_list_entry(hdr->pages.next); struct page *page = req->wb_page; - if (!PageCompound(page)) { - if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { - if (bytes < hdr->good_bytes) - set_page_dirty(page); - } else - set_page_dirty(page); - } + if (!PageCompound(page) && bytes < hdr->good_bytes) + set_page_dirty(page); bytes += req->wb_bytes; nfs_list_remove_request(req); nfs_direct_readpage_release(req); -- cgit v1.2.1 From eb96d5c97b0825d542e9c4ba5e0a22b519355166 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Tue, 27 Nov 2012 10:34:19 -0500 Subject: SUNRPC handle EKEYEXPIRED in call_refreshresult Currently, when an RPCSEC_GSS context has expired or is non-existent and the users (Kerberos) credentials have also expired or are non-existent, the client receives the -EKEYEXPIRED error and tries to refresh the context forever. If an application is performing I/O, or other work against the share, the application hangs, and the user is not prompted to refresh/establish their credentials. This can result in a denial of service for other users. Users are expected to manage their Kerberos credential lifetimes to mitigate this issue. Move the -EKEYEXPIRED handling into the RPC layer. Try tk_cred_retry number of times to refresh the gss_context, and then return -EACCES to the application. 
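The refresh path itself lives in net/sunrpc, which this fs-only log does not include; the shape described above is roughly (a sketch, not the verbatim change):

	case -EKEYEXPIRED:
		if (!task->tk_cred_retry) {
			rpc_exit(task, -EACCES);	/* give up, fail the call */
			break;
		}
		task->tk_cred_retry--;
		task->tk_action = call_refresh;		/* try another refresh */
		break;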
Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs3proc.c | 6 +++--- fs/nfs/nfs4filelayout.c | 1 - fs/nfs/nfs4proc.c | 18 ------------------ fs/nfs/nfs4state.c | 23 ----------------------- fs/nfs/proc.c | 43 ------------------------------------------- 5 files changed, 3 insertions(+), 88 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 69322096c325..70efb63b1e42 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -24,14 +24,14 @@ #define NFSDBG_FACILITY NFSDBG_PROC -/* A wrapper to handle the EJUKEBOX and EKEYEXPIRED error messages */ +/* A wrapper to handle the EJUKEBOX error messages */ static int nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) { int res; do { res = rpc_call_sync(clnt, msg, flags); - if (res != -EJUKEBOX && res != -EKEYEXPIRED) + if (res != -EJUKEBOX) break; freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); res = -ERESTARTSYS; @@ -44,7 +44,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) static int nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode) { - if (task->tk_status != -EJUKEBOX && task->tk_status != -EKEYEXPIRED) + if (task->tk_status != -EJUKEBOX) return 0; if (task->tk_status == -EJUKEBOX) nfs_inc_stats(inode, NFSIOS_DELAY); diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 1e42413fab8f..194c48410336 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -179,7 +179,6 @@ static int filelayout_async_handle_error(struct rpc_task *task, break; case -NFS4ERR_DELAY: case -NFS4ERR_GRACE: - case -EKEYEXPIRED: rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX); break; case -NFS4ERR_RETRY_UNCACHED_REP: diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a4692e97bc19..b0963aeceeda 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -333,7 +333,6 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc } case -NFS4ERR_GRACE: case -NFS4ERR_DELAY: - case -EKEYEXPIRED: ret = nfs4_delay(server->client, &exception->timeout); if (ret != 0) break; @@ -1343,13 +1342,6 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state nfs_inode_find_state_and_recover(state->inode, stateid); nfs4_schedule_stateid_recovery(server, state); - case -EKEYEXPIRED: - /* - * User RPCSEC_GSS context has expired. - * We cannot recover this stateid now, so - * skip it and allow recovery thread to - * proceed. - */ case -ENOMEM: err = 0; goto out; @@ -3946,7 +3938,6 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, case -NFS4ERR_DELAY: nfs_inc_server_stats(server, NFSIOS_DELAY); case -NFS4ERR_GRACE: - case -EKEYEXPIRED: rpc_delay(task, NFS4_POLL_RETRY_MAX); task->tk_status = 0; return -EAGAIN; @@ -4946,15 +4937,6 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) nfs4_schedule_stateid_recovery(server, state); err = 0; goto out; - case -EKEYEXPIRED: - /* - * User RPCSEC_GSS context has expired. - * We cannot recover this stateid now, so - * skip it and allow recovery thread to - * proceed. - */ - err = 0; - goto out; case -ENOMEM: case -NFS4ERR_DENIED: /* kill_proc(fl->fl_pid, SIGLOST, 1); */ diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 78e90a80fc3a..8dcbd9a0367d 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1437,14 +1437,6 @@ restart: /* Mark the file as being 'closed' */ state->state = 0; break; - case -EKEYEXPIRED: - /* - * User RPCSEC_GSS context has expired. 
- * We cannot recover this stateid now, so - * skip it and allow recovery thread to - * proceed. - */ - break; case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_BAD_STATEID: @@ -1597,14 +1589,6 @@ static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp) nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); } -static void nfs4_warn_keyexpired(const char *s) -{ - printk_ratelimited(KERN_WARNING "Error: state manager" - " encountered RPCSEC_GSS session" - " expired against NFSv4 server %s.\n", - s); -} - static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) { switch (error) { @@ -1638,10 +1622,6 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); break; - case -EKEYEXPIRED: - /* Nothing we can do */ - nfs4_warn_keyexpired(clp->cl_hostname); - break; default: dprintk("%s: failed to handle error %d for server %s\n", __func__, error, clp->cl_hostname); @@ -1758,8 +1738,6 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status) dprintk("%s: exit with error %d for server %s\n", __func__, -EPROTONOSUPPORT, clp->cl_hostname); return -EPROTONOSUPPORT; - case -EKEYEXPIRED: - nfs4_warn_keyexpired(clp->cl_hostname); case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery * in nfs4_exchange_id */ default: @@ -1912,7 +1890,6 @@ again: break; case -EKEYEXPIRED: - nfs4_warn_keyexpired(clp->cl_hostname); case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery * in nfs4_exchange_id */ status = -EKEYEXPIRED; diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 50a88c3546ed..f084dac948e1 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -46,39 +46,6 @@ #define NFSDBG_FACILITY NFSDBG_PROC -/* - * wrapper to handle the -EKEYEXPIRED error message. This should generally - * only happen if using krb5 auth and a user's TGT expires. NFSv2 doesn't - * support the NFSERR_JUKEBOX error code, but we handle this situation in the - * same way that we handle that error with NFSv3. - */ -static int -nfs_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) -{ - int res; - do { - res = rpc_call_sync(clnt, msg, flags); - if (res != -EKEYEXPIRED) - break; - freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); - res = -ERESTARTSYS; - } while (!fatal_signal_pending(current)); - return res; -} - -#define rpc_call_sync(clnt, msg, flags) nfs_rpc_wrapper(clnt, msg, flags) - -static int -nfs_async_handle_expired_key(struct rpc_task *task) -{ - if (task->tk_status != -EKEYEXPIRED) - return 0; - task->tk_status = 0; - rpc_restart_call(task); - rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); - return 1; -} - /* * Bare-bones access to getattr: this is for nfs_read_super. 
*/ @@ -364,8 +331,6 @@ static void nfs_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlink static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir) { - if (nfs_async_handle_expired_key(task)) - return 0; nfs_mark_for_revalidate(dir); return 1; } @@ -385,8 +350,6 @@ static int nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir, struct inode *new_dir) { - if (nfs_async_handle_expired_key(task)) - return 0; nfs_mark_for_revalidate(old_dir); nfs_mark_for_revalidate(new_dir); return 1; @@ -642,9 +605,6 @@ static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data) { struct inode *inode = data->header->inode; - if (nfs_async_handle_expired_key(task)) - return -EAGAIN; - nfs_invalidate_atime(inode); if (task->tk_status >= 0) { nfs_refresh_inode(inode, data->res.fattr); @@ -671,9 +631,6 @@ static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data) { struct inode *inode = data->header->inode; - if (nfs_async_handle_expired_key(task)) - return -EAGAIN; - if (task->tk_status >= 0) nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); return 0; -- cgit v1.2.1 From 0253f40ef9a709a1af39ce38b1d998af090f8127 Mon Sep 17 00:00:00 2001 From: "jeff.liu" Date: Sat, 27 Oct 2012 12:06:39 +0000 Subject: Btrfs: Remove the invalid shrink size check from btrfs_shrink_dev() Remove an invalid size check from btrfs_shrink_dev(). The new size cannot be larger than device->total_bytes, since that was already verified before we get here (i.e. new_size < old_size). Signed-off-by: Jie Liu Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 2 +- fs/btrfs/volumes.c | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8fcf9a59c28d..14c0d2e0790c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1409,7 +1409,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, btrfs_commit_transaction(trans, root); } else if (new_size < old_size) { ret = btrfs_shrink_device(device, new_size); - } + } /* equal, nothing need to do */ out_free: kfree(vol_args); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index eaaf0bf52791..32a88428f6da 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3059,9 +3059,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) u64 old_size = device->total_bytes; u64 diff = device->total_bytes - new_size; - if (new_size >= device->total_bytes) - return -EINVAL; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; -- cgit v1.2.1 From d1423248734df6d9aff769abffd675dc034e0601 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Wed, 31 Oct 2012 15:16:32 +0000 Subject: Btrfs: Fix typo in fs/btrfs Correct spelling typos in btrfs.
Signed-off-by: Masanari Iida Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 +- fs/btrfs/volumes.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index cad16566da37..2d41cb25266b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -413,7 +413,7 @@ struct btrfs_root_backup { __le64 bytes_used; __le64 num_devices; /* future */ - __le64 unsed_64[4]; + __le64 unused_64[4]; u8 tree_root_level; u8 chunk_root_level; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 32a88428f6da..eeed97d19dee 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4261,7 +4261,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, rcu_read_lock(); name = rcu_dereference(dev->name); - pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu " + pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu " "(%s id %llu), size=%u\n", rw, (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, name->str, dev->devid, bio->bi_size); -- cgit v1.2.1 From 292fd7fc39aa06668f3a8db546714e727120cb3e Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 30 Oct 2012 17:16:16 +0000 Subject: Btrfs: don't allow degraded mount if too many devices are missing The current behavior is to allow mounting or remounting a filesystem writeable in degraded mode if at least one writeable device is present. The next failed write access to a missing device which is above the tolerance of the configured level of redundancy results in a forced switch to read-only mode. Even without this, the next time barrier_all_devices() is called and more devices are missing than tolerable, the switch to read-only mode takes place. In order to behave predictably and to provide proper feedback to the user at mount time, this patch compares the number of missing devices with the number of devices that are tolerated to be missing according to the configured RAID level. If more devices are missing than tolerated, e.g. if two devices are missing in case of RAID1, only a read-only mount and remount is allowed.
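The policy amounts to a single comparison, roughly as sketched below (the helper name is hypothetical; the field names come straight from the diff that follows, where the real checks are added to open_ctree() and btrfs_remount()):

	/*
	 * Sketch of the degraded-mount policy: refuse a writeable
	 * (re)mount once more devices are missing than the configured
	 * RAID profile tolerates; read-only access is always allowed.
	 */
	static bool writeable_degraded_mount_ok(struct btrfs_fs_info *fs_info,
						bool read_only)
	{
		if (read_only)
			return true;	/* read-only mounts are always permitted */

		return fs_info->fs_devices->missing_devices <=
		       fs_info->num_tolerated_disk_barrier_failures;
	}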
Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 7 +++++++ fs/btrfs/super.c | 9 +++++++++ 2 files changed, 16 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index bd70c2852ba0..064315990f8a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2508,6 +2508,13 @@ retry_root_backup: } fs_info->num_tolerated_disk_barrier_failures = btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); + if (fs_info->fs_devices->missing_devices > + fs_info->num_tolerated_disk_barrier_failures && + !(sb->s_flags & MS_RDONLY)) { + printk(KERN_WARNING + "Btrfs: too many missing devices, writeable mount is not allowed\n"); + goto fail_block_groups; + } fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, "btrfs-cleaner"); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 915ac14c2064..acd2df85bed5 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1226,6 +1226,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; } + if (fs_info->fs_devices->missing_devices > + fs_info->num_tolerated_disk_barrier_failures && + !(*flags & MS_RDONLY)) { + printk(KERN_WARNING + "Btrfs: too many missing devices, writeable remount is not allowed\n"); + ret = -EACCES; + goto restore; + } + if (btrfs_super_log_root(fs_info->super_copy) != 0) { ret = -EINVAL; goto restore; -- cgit v1.2.1 From 183f37fa3503332740c76f1b493f4304ec889358 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 1 Nov 2012 06:38:47 +0000 Subject: Btrfs: do not log extents when we only log new names When we log new names, we need to log just enough to recreate the inode during log replay, and there is no need to log extents along with it. This actually fixes a bug revealed by xfstests 241: we were logging some extents whose metadata had not been updated, so we did not get proper EXTENT_DATA items copied to the log tree. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 81e407d9677a..4ec41ecb4d65 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3435,7 +3435,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); } else { - fast_search = true; + if (inode_only == LOG_INODE_ALL) + fast_search = true; max_key.type = BTRFS_XATTR_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, BTRFS_XATTR_ITEM_KEY); -- cgit v1.2.1 From 9f3959c53d57d010ae6f4205fbd0159cb7976a83 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 1 Nov 2012 06:38:48 +0000 Subject: Btrfs: get right arguments for btrfs_wait_ordered_range btrfs_wait_ordered_range() expects a 'len' argument, not an 'end'. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9ab1bed88116..d2df98124d0f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1562,7 +1562,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * range being left.
*/ atomic_inc(&root->log_batch); - btrfs_wait_ordered_range(inode, start, end); + btrfs_wait_ordered_range(inode, start, end - start + 1); atomic_inc(&root->log_batch); /* -- cgit v1.2.1 From 4fde183d8c755f8a8bdffcb03a8d947e62ccea6a Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 1 Nov 2012 06:38:49 +0000 Subject: Btrfs: cleanup for btrfs_wait_ordered_range The variable 'found' is no longer used. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/ordered-data.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index eecc20f14cfa..f10731297040 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -653,7 +653,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) u64 end; u64 orig_end; struct btrfs_ordered_extent *ordered; - int found; if (start + len < start) { orig_end = INT_LIMIT(loff_t); @@ -689,7 +688,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) filemap_fdatawait_range(inode->i_mapping, start, orig_end); end = orig_end; - found = 0; while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, end); if (!ordered) @@ -702,7 +700,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) btrfs_put_ordered_extent(ordered); break; } - found++; btrfs_start_ordered_extent(inode, ordered, 1); end = ordered->file_offset; btrfs_put_ordered_extent(ordered); -- cgit v1.2.1 From b7d5b0a819498a9c04e1d18201a42468f7edd92a Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 1 Nov 2012 07:32:18 +0000 Subject: Btrfs: fix joining the same transaction handle more than 2 times If we flush inodes with pending delalloc in a transaction, we may join the same transaction handle more than 2 times. The reason is: Task use_count of trans handle commit_transaction 1 |-> btrfs_start_delalloc_inodes 1 |-> run_delalloc_nocow 1 |-> join_transaction 2 |-> cow_file_range 2 |-> join_transaction 3 In fact, cow_file_range() needn't join the transaction again because the caller has already joined it, so fix the problem that way. Reported-by: Liu Bo Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 77 ++++++++++++++++++++++++++++++-------------------- fs/btrfs/transaction.c | 1 + 2 files changed, 48 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dce9e218b845..96d20903beeb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -804,14 +804,14 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start, * required to start IO on it. It may be clean and already done with * IO when we return.
*/ -static noinline int cow_file_range(struct inode *inode, - struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, - int unlock) +static noinline int __cow_file_range(struct btrfs_trans_handle *trans, + struct inode *inode, + struct btrfs_root *root, + struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written, + int unlock) { - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; u64 alloc_hint = 0; u64 num_bytes; unsigned long ram_size; @@ -824,25 +824,10 @@ static noinline int cow_file_range(struct inode *inode, int ret = 0; BUG_ON(btrfs_is_free_space_inode(inode)); - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, locked_page, - EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_UNLOCK | - EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_DIRTY | - EXTENT_SET_WRITEBACK | - EXTENT_END_WRITEBACK); - return PTR_ERR(trans); - } - trans->block_rsv = &root->fs_info->delalloc_block_rsv; num_bytes = (end - start + blocksize) & ~(blocksize - 1); num_bytes = max(blocksize, num_bytes); disk_num_bytes = num_bytes; - ret = 0; /* if this is a small write inside eof, kick off defrag */ if (num_bytes < 64 * 1024 && @@ -953,11 +938,9 @@ static noinline int cow_file_range(struct inode *inode, alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; } - ret = 0; out: - btrfs_end_transaction(trans, root); - return ret; + out_unlock: extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, @@ -972,6 +955,39 @@ out_unlock: goto out; } +static noinline int cow_file_range(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written, + int unlock) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, locked_page, + EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); + return PTR_ERR(trans); + } + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + + ret = __cow_file_range(trans, inode, root, locked_page, start, end, + page_started, nr_written, unlock); + + btrfs_end_transaction(trans, root); + + return ret; +} + /* * work queue call back to started compression on a file and pages */ @@ -1282,9 +1298,9 @@ out_check: btrfs_release_path(path); if (cow_start != (u64)-1) { - ret = cow_file_range(inode, locked_page, cow_start, - found_key.offset - 1, page_started, - nr_written, 1); + ret = __cow_file_range(trans, inode, root, locked_page, + cow_start, found_key.offset - 1, + page_started, nr_written, 1); if (ret) { btrfs_abort_transaction(trans, root, ret); goto error; @@ -1353,8 +1369,9 @@ out_check: } if (cow_start != (u64)-1) { - ret = cow_file_range(inode, locked_page, cow_start, end, - page_started, nr_written, 1); + ret = __cow_file_range(trans, inode, root, locked_page, + cow_start, end, + page_started, nr_written, 1); if (ret) { btrfs_abort_transaction(trans, root, ret); goto error; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 259f74eabdb8..44a5d73fddbe 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -312,6 +312,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type, WARN_ON(type != TRANS_JOIN && type != 
TRANS_JOIN_NOLOCK); h = current->journal_info; h->use_count++; + WARN_ON(h->use_count > 2); h->orig_rsv = h->block_rsv; h->block_rsv = NULL; goto got_it; -- cgit v1.2.1 From ca46963718ef7368c84267c9f5e7394c3890442a Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 1 Nov 2012 07:33:14 +0000 Subject: Btrfs: fix missing flush when committing a transaction Consider the following case: Task1 Task2 start_transaction commit_transaction check pending snapshots list and the list is empty. add pending snapshot into list skip the delalloc flush end_transaction ... And then the problem that the snapshot is different with the source subvolume happen. This patch fixes the above problem by flush all pending stuffs when all the other tasks end the transaction. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 82 +++++++++++++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 44a5d73fddbe..bc1f52397334 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1399,6 +1399,48 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_trans_handle_cachep, trans); } +static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); + int snap_pending = 0; + int ret; + + if (!flush_on_commit) { + spin_lock(&root->fs_info->trans_lock); + if (!list_empty(&trans->transaction->pending_snapshots)) + snap_pending = 1; + spin_unlock(&root->fs_info->trans_lock); + } + + if (flush_on_commit || snap_pending) { + btrfs_start_delalloc_inodes(root, 1); + btrfs_wait_ordered_extents(root, 1); + } + + ret = btrfs_run_delayed_items(trans, root); + if (ret) + return ret; + + /* + * running the delayed items may have added new refs. account + * them now so that they hinder processing of more delayed refs + * as little as possible. + */ + btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); + + /* + * rename don't use btrfs_join_transaction, so, once we + * set the transaction to blocked above, we aren't going + * to get any new ordered operations. We can safely run + * it here and no for sure that nothing new will be added + * to the list + */ + btrfs_run_ordered_operations(root, 1); + + return 0; +} + /* * btrfs_transaction state sequence: * in_commit = 0, blocked = 0 (initial) @@ -1416,7 +1458,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, int ret; int should_grow = 0; unsigned long now = get_seconds(); - int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); ret = btrfs_run_ordered_operations(root, 0); if (ret) { @@ -1495,47 +1536,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, should_grow = 1; do { - int snap_pending = 0; - joined = cur_trans->num_joined; - if (!list_empty(&trans->transaction->pending_snapshots)) - snap_pending = 1; WARN_ON(cur_trans != trans->transaction); - if (flush_on_commit || snap_pending) { - ret = btrfs_start_delalloc_inodes(root, 1); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto cleanup_transaction; - } - btrfs_wait_ordered_extents(root, 1); - } - - ret = btrfs_run_delayed_items(trans, root); + ret = btrfs_flush_all_pending_stuffs(trans, root); if (ret) goto cleanup_transaction; - /* - * running the delayed items may have added new refs. account - * them now so that they hinder processing of more delayed refs - * as little as possible. 
- */ - btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); - - /* - * rename don't use btrfs_join_transaction, so, once we - * set the transaction to blocked above, we aren't going - * to get any new ordered operations. We can safely run - * it here and no for sure that nothing new will be added - * to the list - */ - ret = btrfs_run_ordered_operations(root, 1); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto cleanup_transaction; - } - prepare_to_wait(&cur_trans->writer_wait, &wait, TASK_UNINTERRUPTIBLE); @@ -1548,6 +1556,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, } while (atomic_read(&cur_trans->num_writers) > 1 || (should_grow && cur_trans->num_joined != joined)); + ret = btrfs_flush_all_pending_stuffs(trans, root); + if (ret) + goto cleanup_transaction; + /* * Ok now we need to make sure to block out any other joins while we * commit the transaction. We could have started a join before setting -- cgit v1.2.1 From 315a9850da2b89c83971b26fe54a60f22bdd91ad Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 1 Nov 2012 07:33:59 +0000 Subject: Btrfs: fix wrong file extent length There are two types of the file extent - inline extent and regular extent, When we log file extents, we didn't take inline extent into account, fix it. Signed-off-by: Miao Xie Reviewed-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 1 + fs/btrfs/file-item.c | 21 ++++++++++++++++++++- fs/btrfs/tree-log.c | 10 ++-------- 3 files changed, 23 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2d41cb25266b..f9a078661ebc 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3263,6 +3263,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid, u64 bytenr, int mod); +u64 btrfs_file_extent_length(struct btrfs_path *path); int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 1ad08e4e4a15..bd38cef42358 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -133,7 +133,6 @@ fail: return ERR_PTR(ret); } - int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid, @@ -151,6 +150,26 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, return ret; } +u64 btrfs_file_extent_length(struct btrfs_path *path) +{ + int extent_type; + struct btrfs_file_extent_item *fi; + u64 len; + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(path->nodes[0], fi); + + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) + len = btrfs_file_extent_num_bytes(path->nodes[0], fi); + else if (extent_type == BTRFS_FILE_EXTENT_INLINE) + len = btrfs_file_extent_inline_len(path->nodes[0], fi); + else + BUG(); + + return len; +} static int __btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, struct bio *bio, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4ec41ecb4d65..bcf0e48b1932 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3143,7 +3143,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct btrfs_path *dst_path, struct log_args *args) { struct btrfs_root *log = root->log_root; - struct btrfs_file_extent_item *fi; struct btrfs_key key; u64 start = em->mod_start; u64 search_start = 
start; @@ -3199,10 +3198,7 @@ again: } } while (key.offset > start); - fi = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - num_bytes = btrfs_file_extent_num_bytes(path->nodes[0], - fi); + num_bytes = btrfs_file_extent_length(path); if (key.offset + num_bytes <= start) { btrfs_release_path(path); return -ENOENT; @@ -3211,8 +3207,7 @@ again: args->src = path->nodes[0]; next_slot: btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - fi = btrfs_item_ptr(args->src, path->slots[0], - struct btrfs_file_extent_item); + num_bytes = btrfs_file_extent_length(path); if (args->nr && args->start_slot + args->nr == path->slots[0]) { args->nr++; @@ -3230,7 +3225,6 @@ next_slot: } nritems = btrfs_header_nritems(path->nodes[0]); path->slots[0]++; - num_bytes = btrfs_file_extent_num_bytes(args->src, fi); if (len < num_bytes) { /* I _think_ this is ok, envision we write to a * preallocated space that is adjacent to a previously -- cgit v1.2.1 From bbe1426764e5dfaa57e7b12cc954acdb3fb7f94b Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 1 Nov 2012 07:34:54 +0000 Subject: Btrfs: fix unprotected extent map operation when logging file extents We forget to protect the modified_extents list, fix it. Signed-off-by: Miao Xie Reviewed-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index bcf0e48b1932..d1947af67bcd 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3526,8 +3526,10 @@ next_slot: struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; struct extent_map *em, *n; + write_lock(&tree->lock); list_for_each_entry_safe(em, n, &tree->modified_extents, list) list_del_init(&em->list); + write_unlock(&tree->lock); } if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { -- cgit v1.2.1 From 5269b67e3d809dcaa4c6763a343423bb1b7b3fe6 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 1 Nov 2012 07:35:23 +0000 Subject: Btrfs: fix missing log when BTRFS_INODE_NEEDS_FULL_SYNC is set If we set BTRFS_INODE_NEEDS_FULL_SYNC, we should log all the extent, but now we forget to take it into account, and set a wrong max key, if so, we will skip the file extent metadata when doing logging. Fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d1947af67bcd..40b9efd20e43 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3394,7 +3394,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, /* today the code can only do partial logging of directories */ - if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) + if (S_ISDIR(inode->i_mode) || + (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags) && + inode_only == LOG_INODE_EXISTS)) max_key.type = BTRFS_XATTR_ITEM_KEY; else max_key.type = (u8)-1; -- cgit v1.2.1 From 31b1a2bd758f439fc945b3ac5899d890cb7e2dc6 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 3 Nov 2012 10:58:34 +0000 Subject: fs/btrfs: use WARN Use WARN rather than printk followed by WARN_ON(1), for conciseness. 
A simplified version of the semantic patch that makes this transformation is as follows: (http://coccinelle.lip6.fr/) // @@ expression list es; @@ -printk( +WARN(1, es); -WARN_ON(1); // Signed-off-by: Julia Lawall Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 19 +++++++------------ fs/btrfs/disk-io.c | 6 ++---- fs/btrfs/extent-tree.c | 7 +++---- fs/btrfs/extent_io.c | 9 +++------ fs/btrfs/inode.c | 3 +-- fs/btrfs/transaction.c | 12 ++++-------- fs/btrfs/volumes.c | 3 +-- 7 files changed, 21 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 100c274a1cfe..0e4adb00e9d9 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1359,19 +1359,16 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, u64 search_start; int ret; - if (trans->transaction != root->fs_info->running_transaction) { - printk(KERN_CRIT "trans %llu running %llu\n", + if (trans->transaction != root->fs_info->running_transaction) + WARN(1, KERN_CRIT "trans %llu running %llu\n", (unsigned long long)trans->transid, (unsigned long long) root->fs_info->running_transaction->transid); - WARN_ON(1); - } - if (trans->transid != root->fs_info->generation) { - printk(KERN_CRIT "trans %llu running %llu\n", + + if (trans->transid != root->fs_info->generation) + WARN(1, KERN_CRIT "trans %llu running %llu\n", (unsigned long long)trans->transid, (unsigned long long)root->fs_info->generation); - WARN_ON(1); - } if (!should_cow_block(trans, root, buf)) { *cow_ret = buf; @@ -3640,11 +3637,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(left, old_left_nritems + push_items); /* fixup right node */ - if (push_items > right_nritems) { - printk(KERN_CRIT "push items %d nr %u\n", push_items, + if (push_items > right_nritems) + WARN(1, KERN_CRIT "push items %d nr %u\n", push_items, right_nritems); - WARN_ON(1); - } if (push_items < right_nritems) { push_space = btrfs_item_offset_nr(right, push_items - 1) - diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 064315990f8a..07a2162cdd65 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3397,14 +3397,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) int was_dirty; btrfs_assert_tree_locked(buf); - if (transid != root->fs_info->generation) { - printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " + if (transid != root->fs_info->generation) + WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, " "found %llu running %llu\n", (unsigned long long)buf->start, (unsigned long long)transid, (unsigned long long)root->fs_info->generation); - WARN_ON(1); - } was_dirty = set_extent_buffer_dirty(buf); if (!was_dirty) { spin_lock(&root->fs_info->delalloc_lock); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b495cb4b9b2b..0bcb9543da60 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6292,10 +6292,9 @@ use_block_rsv(struct btrfs_trans_handle *trans, static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, /*DEFAULT_RATELIMIT_BURST*/ 2); - if (__ratelimit(&_rs)) { - printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret); - WARN_ON(1); - } + if (__ratelimit(&_rs)) + WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n", + ret); ret = reserve_metadata_bytes(root, block_rsv, blocksize, BTRFS_RESERVE_NO_FLUSH); if (!ret) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 472873a94d96..3c062c8d1d70 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -341,12 +341,10 @@ static 
int insert_state(struct extent_io_tree *tree, { struct rb_node *node; - if (end < start) { - printk(KERN_ERR "btrfs end < start %llu %llu\n", + if (end < start) + WARN(1, KERN_ERR "btrfs end < start %llu %llu\n", (unsigned long long)end, (unsigned long long)start); - WARN_ON(1); - } state->start = start; state->end = end; @@ -4721,10 +4719,9 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, } if (start + min_len > eb->len) { - printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " + WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, " "wanted %lu %lu\n", (unsigned long long)eb->start, eb->len, start, min_len); - WARN_ON(1); return -EINVAL; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 96d20903beeb..6dca345fd1b6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5458,8 +5458,7 @@ again: extent_map_end(em) - 1, NULL, GFP_NOFS); goto insert; } else { - printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); - WARN_ON(1); + WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type); } not_found: em->start = start; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bc1f52397334..f21f39f0b1a1 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -145,16 +145,12 @@ loop: * the log must never go across transaction boundaries. */ smp_mb(); - if (!list_empty(&fs_info->tree_mod_seq_list)) { - printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when " + if (!list_empty(&fs_info->tree_mod_seq_list)) + WARN(1, KERN_ERR "btrfs: tree_mod_seq_list not empty when " "creating a fresh transaction\n"); - WARN_ON(1); - } - if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) { - printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when " + if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) + WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when " "creating a fresh transaction\n"); - WARN_ON(1); - } atomic_set(&fs_info->tree_mod_seq, 0); spin_lock_init(&cur_trans->commit_lock); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index eeed97d19dee..3f4bfee66d7b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3323,9 +3323,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, cur = cur->next; if (!device->writeable) { - printk(KERN_ERR + WARN(1, KERN_ERR "btrfs: read-only device in alloc_list\n"); - WARN_ON(1); continue; } -- cgit v1.2.1 From 6c1500f22a7be3a24ad3dffcdbf04be3f676521b Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 3 Nov 2012 20:30:18 +0000 Subject: fs/btrfs: drop if around WARN_ON Just use WARN_ON rather than an if containing only WARN_ON(1). 
A simplified version of the semantic patch that makes this transformation is as follows: (http://coccinelle.lip6.fr/) // @@ expression e; @@ - if (e) WARN_ON(1); + WARN_ON(e); // Signed-off-by: Julia Lawall Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 3 +-- fs/btrfs/ctree.c | 9 +++------ fs/btrfs/inode.c | 3 +-- 3 files changed, 5 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 208d8aa5b07e..a3219523ebc9 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -890,8 +890,7 @@ again: while (!list_empty(&prefs)) { ref = list_first_entry(&prefs, struct __prelim_ref, list); list_del(&ref->list); - if (ref->count < 0) - WARN_ON(1); + WARN_ON(ref->count < 0); if (ref->count && ref->root_id && ref->parent == 0) { /* no parent == root of tree */ ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0e4adb00e9d9..5c2cf992e717 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1464,10 +1464,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, if (cache_only && parent_level != 1) return 0; - if (trans->transaction != root->fs_info->running_transaction) - WARN_ON(1); - if (trans->transid != root->fs_info->generation) - WARN_ON(1); + WARN_ON(trans->transaction != root->fs_info->running_transaction); + WARN_ON(trans->transid != root->fs_info->generation); parent_nritems = btrfs_header_nritems(parent); blocksize = btrfs_level_size(root, parent_level - 1); @@ -3398,8 +3396,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (push_items == 0) goto out_unlock; - if (!empty && push_items == left_nritems) - WARN_ON(1); + WARN_ON(!empty && push_items == left_nritems); /* push left to right */ right_nritems = btrfs_header_nritems(right); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6dca345fd1b6..e8733fab2739 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1675,8 +1675,7 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, struct extent_state **cached_state) { - if ((end & (PAGE_CACHE_SIZE - 1)) == 0) - WARN_ON(1); + WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0); return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, cached_state, GFP_NOFS); } -- cgit v1.2.1 From 37c4146d2208ba7e4463e8dd95a1bf9e3d865280 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 5 Nov 2012 12:42:08 +0000 Subject: Btrfs: fix a deadlock in aborting transaction due to ENOSPC When committing a transaction, we may bail out of running delayed refs due to ENOSPC, and then abort the current transaction to flip into readonly. But we'll hit a deadlock on ref head's lock since we forget to release its lock and other cleanup stuff. 
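The fix therefore releases the locked ref head before bailing out; the pattern is roughly the following (simplified from the hunks in the patch below):

	/* On an error in run_clustered_refs(), the locked ref head must
	 * be unhooked from the cluster and its mutex dropped before
	 * returning, otherwise the next task that processes delayed refs
	 * blocks forever on locked_ref->mutex.
	 */
	if (ret) {
		if (locked_ref) {
			list_del_init(&locked_ref->cluster);
			mutex_unlock(&locked_ref->mutex);
		}
		spin_lock(&delayed_refs->lock);
		return ret;
	}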
Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0bcb9543da60..f8a358aee060 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2297,6 +2297,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, kfree(extent_op); if (ret) { + list_del_init(&locked_ref->cluster); + mutex_unlock(&locked_ref->mutex); + printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret); spin_lock(&delayed_refs->lock); return ret; @@ -2339,6 +2342,10 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, count++; if (ret) { + if (locked_ref) { + list_del_init(&locked_ref->cluster); + mutex_unlock(&locked_ref->mutex); + } printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret); spin_lock(&delayed_refs->lock); return ret; -- cgit v1.2.1 From 109f2365f1928af241b2ccbd0f6ba0b93d911288 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 5 Nov 2012 12:42:09 +0000 Subject: Btrfs: fix a double free on pending snapshots in error handling When creating a snapshot, failing to commit a transaction can end up aborting the transaction, followed by a cleanup that frees all pending snapshots. Check for that case to avoid a double free of the pending snapshot. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 14c0d2e0790c..e262cd8c4a7d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -571,8 +571,12 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, ret = btrfs_commit_transaction(trans, root->fs_info->extent_root); } - if (ret) + if (ret) { + /* cleanup_transaction has freed this for us */ + if (trans->aborted) + pending_snapshot = NULL; goto fail; + } ret = pending_snapshot->error; if (ret) -- cgit v1.2.1 From d03f918ab9036cc71740c0aa796c8e02e6f6f6d3 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 13:10:49 +0000 Subject: Btrfs: Don't trust the superblock label and simply printk("%s") it Someone who is root or capable(CAP_SYS_ADMIN) could corrupt the superblock and make Btrfs printk("%s") crash while holding the uuid_mutex since nobody forces a limit on the string. Since the uuid_mutex is significant, the system would be unusable afterwards.
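The defensive pattern is simply to clamp the untrusted, fixed-size on-disk string before printing it. A minimal sketch (this is a copying variant; the actual patch below terminates the buffer in place):

	/*
	 * Never printk("%s") a fixed-size on-disk buffer directly: a
	 * corrupted superblock may fill all BTRFS_LABEL_SIZE bytes
	 * without a terminating NUL.
	 */
	char label[BTRFS_LABEL_SIZE];

	memcpy(label, disk_super->label, BTRFS_LABEL_SIZE);
	label[BTRFS_LABEL_SIZE - 1] = '\0';	/* force termination */
	printk(KERN_INFO "device label %s ", label);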
Signed-off-by: Stefan Behrens Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3f4bfee66d7b..db79fb7e7e91 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -764,10 +764,13 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, devid = btrfs_stack_device_id(&disk_super->dev_item); transid = btrfs_super_generation(disk_super); total_devices = btrfs_super_num_devices(disk_super); - if (disk_super->label[0]) + if (disk_super->label[0]) { + if (disk_super->label[BTRFS_LABEL_SIZE - 1]) + disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; printk(KERN_INFO "device label %s ", disk_super->label); - else + } else { printk(KERN_INFO "device fsid %pU ", disk_super->fsid); + } printk(KERN_CONT "devid %llu transid %llu %s\n", (unsigned long long)devid, (unsigned long long)transid, path); ret = device_list_add(path, disk_super, devid, fs_devices_ret); -- cgit v1.2.1 From e1f5790e0588bc5b11eb57f95bfde8702049dd0d Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Thu, 8 Nov 2012 04:47:33 +0000 Subject: Btrfs: set hole punching time properly Even when hole punching is executed, the modification time of the file is not updated. So, set the current time on the inode. Signed-off-by: Tsutomu Itoh Signed-off-by: Chris Mason --- fs/btrfs/file.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d2df98124d0f..883cf826cf25 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1964,6 +1964,9 @@ out_trans: if (!trans) goto out_free; + inode_inc_iversion(inode); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + trans->block_rsv = &root->fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); nr = trans->blocks_used; -- cgit v1.2.1 From 3ef5969cd8a42a78ccdbc53f7abb2e6136b2ec65 Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Thu, 8 Nov 2012 21:27:24 +0000 Subject: Btrfs: merge inode_list in __merge_refs When __merge_refs merges two refs, it also needs to merge the inode_lists of both refs. Otherwise we get missed backrefs and memory leaks. This happens for example if two inodes share an extent and both lie in the same leaf and thus also have the same parent.
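Merging two singly linked inode_lists is an append at the tail, roughly as in this sketch (the same walk appears in the hunk below):

	/*
	 * Append ref2's inode_list to the tail of ref1's before ref2 is
	 * freed, so no extent_inode_elem entries are lost or leaked.
	 */
	struct extent_inode_elem *eie = ref1->inode_list;

	while (eie && eie->next)
		eie = eie->next;		/* find the tail of ref1's list */
	if (eie)
		eie->next = ref2->inode_list;
	else
		ref1->inode_list = ref2->inode_list;
	ref1->count += ref2->count;		/* counts are merged as well */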
Signed-off-by: Alexander Block Reviewed-by: Jan Schmidt Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index a3219523ebc9..04edf69be875 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -461,6 +461,7 @@ static int __merge_refs(struct list_head *head, int mode) pos2 = n2, n2 = pos2->next) { struct __prelim_ref *ref2; struct __prelim_ref *xchg; + struct extent_inode_elem *eie; ref2 = list_entry(pos2, struct __prelim_ref, list); @@ -472,12 +473,20 @@ static int __merge_refs(struct list_head *head, int mode) ref1 = ref2; ref2 = xchg; } - ref1->count += ref2->count; } else { if (ref1->parent != ref2->parent) continue; - ref1->count += ref2->count; } + + eie = ref1->inode_list; + while (eie && eie->next) + eie = eie->next; + if (eie) + eie->next = ref2->inode_list; + else + ref1->inode_list = ref2->inode_list; + ref1->count += ref2->count; + list_del(&ref2->list); kfree(ref2); } -- cgit v1.2.1 From b53d3f5db2b79637acadc06a330db6c2c60863f5 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 14 Nov 2012 14:34:34 +0000 Subject: Btrfs: cleanup for btrfs_btree_balance_dirty - 'nr' is no more used. - btrfs_btree_balance_dirty() and __btrfs_btree_balance_dirty() can share a bunch of code. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/delayed-inode.c | 5 +---- fs/btrfs/disk-io.c | 29 ++++++++++------------------- fs/btrfs/disk-io.h | 4 ++-- fs/btrfs/file.c | 9 +++------ fs/btrfs/inode.c | 42 +++++++++++------------------------------- fs/btrfs/relocation.c | 22 ++++++---------------- fs/btrfs/transaction.c | 4 +--- 7 files changed, 34 insertions(+), 81 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0c6dca550ea1..34836036f01b 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1257,7 +1257,6 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) struct btrfs_delayed_node *delayed_node = NULL; struct btrfs_root *root; struct btrfs_block_rsv *block_rsv; - unsigned long nr = 0; int need_requeue = 0; int ret; @@ -1318,11 +1317,9 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) delayed_node); mutex_unlock(&delayed_node->mutex); - nr = trans->blocks_used; - trans->block_rsv = block_rsv; btrfs_end_transaction_dmeta(trans, root); - __btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty_nodelay(root); free_path: btrfs_free_path(path); out: diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 07a2162cdd65..ff5d259ac275 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3411,7 +3411,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) } } -void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) +static void __btrfs_btree_balance_dirty(struct btrfs_root *root, + int flush_delayed) { /* * looks as though older kernels can get into trouble with @@ -3423,7 +3424,8 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) if (current->flags & PF_MEMALLOC) return; - btrfs_balance_delayed_items(root); + if (flush_delayed) + btrfs_balance_delayed_items(root); num_dirty = root->fs_info->dirty_metadata_bytes; @@ -3434,25 +3436,14 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) return; } -void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) +void btrfs_btree_balance_dirty(struct btrfs_root *root) { - /* - * looks as though 
older kernels can get into trouble with - * this code, they end up stuck in balance_dirty_pages forever - */ - u64 num_dirty; - unsigned long thresh = 32 * 1024 * 1024; - - if (current->flags & PF_MEMALLOC) - return; - - num_dirty = root->fs_info->dirty_metadata_bytes; + __btrfs_btree_balance_dirty(root, 1); +} - if (num_dirty > thresh) { - balance_dirty_pages_ratelimited_nr( - root->fs_info->btree_inode->i_mapping, 1); - } - return; +void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root) +{ + __btrfs_btree_balance_dirty(root, 0); } int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 2025a9132c16..305c33efb0e3 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -62,8 +62,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, struct btrfs_key *location); int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); -void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); -void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); +void btrfs_btree_balance_dirty(struct btrfs_root *root); +void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 883cf826cf25..bd7f1b01e051 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1349,7 +1349,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, balance_dirty_pages_ratelimited_nr(inode->i_mapping, dirty_pages); if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) - btrfs_btree_balance_dirty(root, 1); + btrfs_btree_balance_dirty(root); pos += copied; num_written += copied; @@ -1803,7 +1803,6 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) u64 cur_offset = lockstart; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); u64 drop_end; - unsigned long nr; int ret = 0; int err = 0; bool same_page = (offset >> PAGE_CACHE_SHIFT) == @@ -1931,9 +1930,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) break; } - nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); trans = btrfs_start_transaction(root, 3); if (IS_ERR(trans)) { @@ -1969,9 +1967,8 @@ out_trans: trans->block_rsv = &root->fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); - nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); out_free: btrfs_free_path(path); btrfs_free_block_rsv(root, rsv); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e8733fab2739..aabf747d056e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3091,7 +3091,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) struct btrfs_trans_handle *trans; struct inode *inode = dentry->d_inode; int ret; - unsigned long nr = 0; trans = __unlink_start_trans(dir, dentry); if (IS_ERR(trans)) @@ -3111,9 +3110,8 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) } out: - nr = trans->blocks_used; __unlink_end_trans(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); return ret; } @@ -3203,7 +3201,6 @@ 
static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) int err = 0; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_trans_handle *trans; - unsigned long nr = 0; if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; @@ -3232,9 +3229,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) if (!err) btrfs_i_size_write(inode, 0); out: - nr = trans->blocks_used; __unlink_end_trans(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); return err; } @@ -3800,7 +3796,6 @@ void btrfs_evict_inode(struct inode *inode) struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv, *global_rsv; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); - unsigned long nr; int ret; trace_btrfs_inode_evict(inode); @@ -3882,10 +3877,9 @@ void btrfs_evict_inode(struct inode *inode) ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); - nr = trans->blocks_used; btrfs_end_transaction(trans, root); trans = NULL; - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); } btrfs_free_block_rsv(root, rsv); @@ -3901,9 +3895,8 @@ void btrfs_evict_inode(struct inode *inode) root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) btrfs_return_ino(root, btrfs_ino(inode)); - nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); no_delete: clear_inode(inode); return; @@ -4915,7 +4908,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, int err; int drop_inode = 0; u64 objectid; - unsigned long nr = 0; u64 index = 0; if (!new_valid_dev(rdev)) @@ -4965,9 +4957,8 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, d_instantiate(dentry, inode); } out_unlock: - nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -4983,7 +4974,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, struct inode *inode = NULL; int drop_inode = 0; int err; - unsigned long nr = 0; u64 objectid; u64 index = 0; @@ -5033,13 +5023,12 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, d_instantiate(dentry, inode); } out_unlock: - nr = trans->blocks_used; btrfs_end_transaction(trans, root); if (drop_inode) { inode_dec_link_count(inode); iput(inode); } - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); return err; } @@ -5050,7 +5039,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = old_dentry->d_inode; u64 index; - unsigned long nr = 0; int err; int drop_inode = 0; @@ -5094,14 +5082,13 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, btrfs_log_new_name(trans, inode, NULL, parent); } - nr = trans->blocks_used; btrfs_end_transaction(trans, root); fail: if (drop_inode) { inode_dec_link_count(inode); iput(inode); } - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); return err; } @@ -5114,7 +5101,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) int drop_on_err = 0; u64 objectid = 0; u64 index = 0; - unsigned long nr = 1; /* * 2 items for inode and ref @@ -5160,11 +5146,10 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) drop_on_err = 0; out_fail: - nr = trans->blocks_used; btrfs_end_transaction(trans, root); if (drop_on_err) iput(inode); - 
btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); return err; } @@ -6872,7 +6857,6 @@ static int btrfs_truncate(struct inode *inode) int ret; int err = 0; struct btrfs_trans_handle *trans; - unsigned long nr; u64 mask = root->sectorsize - 1; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); @@ -6995,9 +6979,8 @@ static int btrfs_truncate(struct inode *inode) break; } - nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) { @@ -7031,9 +7014,8 @@ static int btrfs_truncate(struct inode *inode) if (ret && !err) err = ret; - nr = trans->blocks_used; ret = btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); } out: @@ -7594,7 +7576,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, unsigned long ptr; struct btrfs_file_extent_item *ei; struct extent_buffer *leaf; - unsigned long nr = 0; name_len = strlen(symname) + 1; if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) @@ -7692,13 +7673,12 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, out_unlock: if (!err) d_instantiate(dentry, inode); - nr = trans->blocks_used; btrfs_end_transaction(trans, root); if (drop_inode) { inode_dec_link_count(inode); iput(inode); } - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); return err; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 270f24ffe1be..300e09ac3659 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2025,7 +2025,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, struct btrfs_root_item *root_item; struct btrfs_path *path; struct extent_buffer *leaf; - unsigned long nr; int level; int max_level; int replaced = 0; @@ -2126,10 +2125,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, path->slots[level]); root_item->drop_level = level; - nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); if (replaced && rc->stage == UPDATE_DATA_PTRS) invalidate_extent_cache(root, &key, &next_key); @@ -2156,10 +2154,9 @@ out: btrfs_update_reloc_root(trans, root); } - nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); if (replaced && rc->stage == UPDATE_DATA_PTRS) invalidate_extent_cache(root, &key, &next_key); @@ -3262,7 +3259,6 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info, struct btrfs_path *path; struct btrfs_root *root = fs_info->tree_root; struct btrfs_trans_handle *trans; - unsigned long nr; int ret = 0; if (inode) @@ -3296,9 +3292,8 @@ truncate: ret = btrfs_truncate_free_space_cache(root, trans, path, inode); btrfs_free_path(path); - nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); out: iput(inode); return ret; @@ -3715,7 +3710,6 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) struct btrfs_trans_handle *trans = NULL; struct btrfs_path *path; struct btrfs_extent_item *ei; - unsigned long nr; u64 flags; u32 item_size; int ret; @@ -3832,9 +3826,8 @@ restart: ret = btrfs_commit_transaction(trans, rc->extent_root); BUG_ON(ret); } else { - nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, rc->extent_root); - 
btrfs_btree_balance_dirty(rc->extent_root, nr); + btrfs_btree_balance_dirty(rc->extent_root); } trans = NULL; @@ -3864,9 +3857,8 @@ restart: GFP_NOFS); if (trans) { - nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, rc->extent_root); - btrfs_btree_balance_dirty(rc->extent_root, nr); + btrfs_btree_balance_dirty(rc->extent_root); } if (!err) { @@ -3945,7 +3937,6 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans; struct btrfs_root *root; struct btrfs_key key; - unsigned long nr; u64 objectid = BTRFS_FIRST_FREE_OBJECTID; int err = 0; @@ -3973,9 +3964,8 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, err = btrfs_orphan_add(trans, inode); out: - nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); if (err) { if (inode) iput(inode); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f21f39f0b1a1..7b297354e738 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -952,7 +952,6 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) struct btrfs_fs_info *info = root->fs_info; struct btrfs_trans_handle *trans; int ret; - unsigned long nr; if (xchg(&root->defrag_running, 1)) return 0; @@ -964,9 +963,8 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) ret = btrfs_defrag_leaves(trans, root, cacheonly); - nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(info->tree_root, nr); + btrfs_btree_balance_dirty(info->tree_root); cond_resched(); if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) -- cgit v1.2.1 From d25628bdd66aedd6e07729d8dc6c8ee846d66d72 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 14 Nov 2012 14:35:30 +0000 Subject: Btrfs: protect devices list with its mutex Since we've killed the bigger volume_mutex, we need to add the devices list mutex back. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index db79fb7e7e91..92e586bc8004 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1681,16 +1681,17 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) filemap_write_and_wait(bdev->bd_inode->i_mapping); devices = &root->fs_info->fs_devices->devices; - /* - * we have the volume lock, so we don't need the extra - * device list mutex while reading the list here. - */ + + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); list_for_each_entry(device, devices, dev_list) { if (device->bdev == bdev) { ret = -EEXIST; + mutex_unlock( + &root->fs_info->fs_devices->device_list_mutex); goto error; } } + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); device = kzalloc(sizeof(*device), GFP_NOFS); if (!device) { -- cgit v1.2.1 From d9d181c1ba7aa09a6d2698e8c7e75b515524d504 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 2 Nov 2012 09:58:09 +0100 Subject: Btrfs: rename the scrub context structure The device replace procedure makes use of the scrub code. The scrub code is the most efficient code to read the allocated data of a disk, i.e. it reads sequentially in order to avoid disk head movements, it skips unallocated blocks, it uses read ahead mechanisms, and it contains all the code to detect and repair defects. This commit is a first preparation step to adapt the scrub code to be shareable for the device replace procedure.
The block device will be removed from the scrub context state structure in a later step. It used to be the source block device. The scrub code as it is used for the device replace procedure reads the source data from wherever it is optimal. The source device might even be gone (disconnected, for instance due to a hardware failure). Or the drive can be so faulty that the device replace procedure tries to avoid access to the faulty source drive as much as possible, and only if all other mirrors are damaged, as a last resort, the source disk is accessed. The modified scrub code operates as if it were handling the source drive and thereby generates an exact copy of the source disk on the target disk, even if the source disk is not present at all. The block device pointer to the source disk is therefore removed in a later patch, and the context structure is renamed (the goal of the current patch) to reflect that it no longer carries source block device scope. Summary: this first preparation step consists of a textual substitution of the term "dev" with the term "ctx" wherever the scrub context is used. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 504 ++++++++++++++++++++++++++--------------------------- fs/btrfs/volumes.h | 2 +- 2 files changed, 253 insertions(+), 253 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 27892f67e69b..29c8aac5bda7 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -42,10 +42,10 @@ */ struct scrub_block; -struct scrub_dev; +struct scrub_ctx; #define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ -#define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */ +#define SCRUB_BIOS_PER_CTX 16 /* 1 MB per device in flight */ #define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ struct scrub_page { @@ -66,7 +66,7 @@ struct scrub_page { struct scrub_bio { int index; - struct scrub_dev *sdev; + struct scrub_ctx *sctx; struct bio *bio; int err; u64 logical; @@ -82,7 +82,7 @@ struct scrub_block { int page_count; atomic_t outstanding_pages; atomic_t ref_count; /* free mem on transition to zero */ - struct scrub_dev *sdev; + struct scrub_ctx *sctx; struct { unsigned int header_error:1; unsigned int checksum_error:1; @@ -91,8 +91,8 @@ struct scrub_block { }; }; -struct scrub_dev { - struct scrub_bio *bios[SCRUB_BIOS_PER_DEV]; +struct scrub_ctx { + struct scrub_bio *bios[SCRUB_BIOS_PER_CTX]; struct btrfs_device *dev; int first_free; int curr; @@ -116,7 +116,7 @@ struct scrub_dev { }; struct scrub_fixup_nodatasum { - struct scrub_dev *sdev; + struct scrub_ctx *sctx; u64 logical; struct btrfs_root *root; struct btrfs_work work; @@ -138,7 +138,7 @@ struct scrub_warning { static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); -static int scrub_setup_recheck_block(struct scrub_dev *sdev, +static int scrub_setup_recheck_block(struct scrub_ctx *sctx, struct btrfs_mapping_tree *map_tree, u64 length, u64 logical, struct scrub_block *sblock); @@ -163,9 +163,9 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock); static int scrub_checksum_super(struct scrub_block *sblock); static void scrub_block_get(struct scrub_block *sblock); static void scrub_block_put(struct scrub_block *sblock); -static int scrub_add_page_to_bio(struct scrub_dev *sdev, +static int scrub_add_page_to_bio(struct scrub_ctx *sctx, struct scrub_page *spage); -static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, +static int scrub_pages(struct scrub_ctx *sctx, u64 logical,
u64 len, u64 physical, u64 flags, u64 gen, int mirror_num, u8 *csum, int force); static void scrub_bio_end_io(struct bio *bio, int err); @@ -173,27 +173,27 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work); static void scrub_block_complete(struct scrub_block *sblock); -static void scrub_free_csums(struct scrub_dev *sdev) +static void scrub_free_csums(struct scrub_ctx *sctx) { - while (!list_empty(&sdev->csum_list)) { + while (!list_empty(&sctx->csum_list)) { struct btrfs_ordered_sum *sum; - sum = list_first_entry(&sdev->csum_list, + sum = list_first_entry(&sctx->csum_list, struct btrfs_ordered_sum, list); list_del(&sum->list); kfree(sum); } } -static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) +static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) { int i; - if (!sdev) + if (!sctx) return; /* this can happen when scrub is cancelled */ - if (sdev->curr != -1) { - struct scrub_bio *sbio = sdev->bios[sdev->curr]; + if (sctx->curr != -1) { + struct scrub_bio *sbio = sctx->bios[sctx->curr]; for (i = 0; i < sbio->page_count; i++) { BUG_ON(!sbio->pagev[i]); @@ -203,69 +203,69 @@ static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) bio_put(sbio->bio); } - for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { - struct scrub_bio *sbio = sdev->bios[i]; + for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) { + struct scrub_bio *sbio = sctx->bios[i]; if (!sbio) break; kfree(sbio); } - scrub_free_csums(sdev); - kfree(sdev); + scrub_free_csums(sctx); + kfree(sctx); } static noinline_for_stack -struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) +struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev) { - struct scrub_dev *sdev; + struct scrub_ctx *sctx; int i; struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; int pages_per_bio; pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO, bio_get_nr_vecs(dev->bdev)); - sdev = kzalloc(sizeof(*sdev), GFP_NOFS); - if (!sdev) + sctx = kzalloc(sizeof(*sctx), GFP_NOFS); + if (!sctx) goto nomem; - sdev->dev = dev; - sdev->pages_per_bio = pages_per_bio; - sdev->curr = -1; - for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { + sctx->dev = dev; + sctx->pages_per_bio = pages_per_bio; + sctx->curr = -1; + for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) { struct scrub_bio *sbio; sbio = kzalloc(sizeof(*sbio), GFP_NOFS); if (!sbio) goto nomem; - sdev->bios[i] = sbio; + sctx->bios[i] = sbio; sbio->index = i; - sbio->sdev = sdev; + sbio->sctx = sctx; sbio->page_count = 0; sbio->work.func = scrub_bio_end_io_worker; - if (i != SCRUB_BIOS_PER_DEV-1) - sdev->bios[i]->next_free = i + 1; + if (i != SCRUB_BIOS_PER_CTX - 1) + sctx->bios[i]->next_free = i + 1; else - sdev->bios[i]->next_free = -1; - } - sdev->first_free = 0; - sdev->nodesize = dev->dev_root->nodesize; - sdev->leafsize = dev->dev_root->leafsize; - sdev->sectorsize = dev->dev_root->sectorsize; - atomic_set(&sdev->in_flight, 0); - atomic_set(&sdev->fixup_cnt, 0); - atomic_set(&sdev->cancel_req, 0); - sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy); - INIT_LIST_HEAD(&sdev->csum_list); - - spin_lock_init(&sdev->list_lock); - spin_lock_init(&sdev->stat_lock); - init_waitqueue_head(&sdev->list_wait); - return sdev; + sctx->bios[i]->next_free = -1; + } + sctx->first_free = 0; + sctx->nodesize = dev->dev_root->nodesize; + sctx->leafsize = dev->dev_root->leafsize; + sctx->sectorsize = dev->dev_root->sectorsize; + atomic_set(&sctx->in_flight, 0); + atomic_set(&sctx->fixup_cnt, 0); + atomic_set(&sctx->cancel_req, 0); + sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy); 
+ INIT_LIST_HEAD(&sctx->csum_list); + + spin_lock_init(&sctx->list_lock); + spin_lock_init(&sctx->stat_lock); + init_waitqueue_head(&sctx->list_wait); + return sctx; nomem: - scrub_free_dev(sdev); + scrub_free_ctx(sctx); return ERR_PTR(-ENOMEM); } @@ -345,7 +345,7 @@ err: static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) { - struct btrfs_device *dev = sblock->sdev->dev; + struct btrfs_device *dev = sblock->sctx->dev; struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; struct btrfs_path *path; struct btrfs_key found_key; @@ -530,21 +530,21 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work) { int ret; struct scrub_fixup_nodatasum *fixup; - struct scrub_dev *sdev; + struct scrub_ctx *sctx; struct btrfs_trans_handle *trans = NULL; struct btrfs_fs_info *fs_info; struct btrfs_path *path; int uncorrectable = 0; fixup = container_of(work, struct scrub_fixup_nodatasum, work); - sdev = fixup->sdev; + sctx = fixup->sctx; fs_info = fixup->root->fs_info; path = btrfs_alloc_path(); if (!path) { - spin_lock(&sdev->stat_lock); - ++sdev->stat.malloc_errors; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + ++sctx->stat.malloc_errors; + spin_unlock(&sctx->stat_lock); uncorrectable = 1; goto out; } @@ -573,22 +573,22 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work) } WARN_ON(ret != 1); - spin_lock(&sdev->stat_lock); - ++sdev->stat.corrected_errors; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + ++sctx->stat.corrected_errors; + spin_unlock(&sctx->stat_lock); out: if (trans && !IS_ERR(trans)) btrfs_end_transaction(trans, fixup->root); if (uncorrectable) { - spin_lock(&sdev->stat_lock); - ++sdev->stat.uncorrectable_errors; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + ++sctx->stat.uncorrectable_errors; + spin_unlock(&sctx->stat_lock); printk_ratelimited_in_rcu(KERN_ERR "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", (unsigned long long)fixup->logical, - rcu_str_deref(sdev->dev->name)); + rcu_str_deref(sctx->dev->name)); } btrfs_free_path(path); @@ -599,9 +599,9 @@ out: atomic_dec(&fs_info->scrubs_running); atomic_dec(&fs_info->scrubs_paused); mutex_unlock(&fs_info->scrub_lock); - atomic_dec(&sdev->fixup_cnt); + atomic_dec(&sctx->fixup_cnt); wake_up(&fs_info->scrub_pause_wait); - wake_up(&sdev->list_wait); + wake_up(&sctx->list_wait); } /* @@ -614,7 +614,7 @@ out: */ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) { - struct scrub_dev *sdev = sblock_to_check->sdev; + struct scrub_ctx *sctx = sblock_to_check->sctx; struct btrfs_fs_info *fs_info; u64 length; u64 logical; @@ -633,7 +633,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) DEFAULT_RATELIMIT_BURST); BUG_ON(sblock_to_check->page_count < 1); - fs_info = sdev->dev->dev_root->fs_info; + fs_info = sctx->dev->dev_root->fs_info; length = sblock_to_check->page_count * PAGE_SIZE; logical = sblock_to_check->pagev[0].logical; generation = sblock_to_check->pagev[0].generation; @@ -677,25 +677,25 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sizeof(*sblocks_for_recheck), GFP_NOFS); if (!sblocks_for_recheck) { - spin_lock(&sdev->stat_lock); - sdev->stat.malloc_errors++; - sdev->stat.read_errors++; - sdev->stat.uncorrectable_errors++; - spin_unlock(&sdev->stat_lock); - btrfs_dev_stat_inc_and_print(sdev->dev, + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + sctx->stat.read_errors++; + sctx->stat.uncorrectable_errors++; + 
spin_unlock(&sctx->stat_lock); + btrfs_dev_stat_inc_and_print(sctx->dev, BTRFS_DEV_STAT_READ_ERRS); goto out; } /* setup the context, map the logical blocks and alloc the pages */ - ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length, + ret = scrub_setup_recheck_block(sctx, &fs_info->mapping_tree, length, logical, sblocks_for_recheck); if (ret) { - spin_lock(&sdev->stat_lock); - sdev->stat.read_errors++; - sdev->stat.uncorrectable_errors++; - spin_unlock(&sdev->stat_lock); - btrfs_dev_stat_inc_and_print(sdev->dev, + spin_lock(&sctx->stat_lock); + sctx->stat.read_errors++; + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + btrfs_dev_stat_inc_and_print(sctx->dev, BTRFS_DEV_STAT_READ_ERRS); goto out; } @@ -704,13 +704,13 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) /* build and submit the bios for the failed mirror, check checksums */ ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, - csum, generation, sdev->csum_size); + csum, generation, sctx->csum_size); if (ret) { - spin_lock(&sdev->stat_lock); - sdev->stat.read_errors++; - sdev->stat.uncorrectable_errors++; - spin_unlock(&sdev->stat_lock); - btrfs_dev_stat_inc_and_print(sdev->dev, + spin_lock(&sctx->stat_lock); + sctx->stat.read_errors++; + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + btrfs_dev_stat_inc_and_print(sctx->dev, BTRFS_DEV_STAT_READ_ERRS); goto out; } @@ -725,45 +725,45 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * different bio (usually one of the two latter cases is * the cause) */ - spin_lock(&sdev->stat_lock); - sdev->stat.unverified_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.unverified_errors++; + spin_unlock(&sctx->stat_lock); goto out; } if (!sblock_bad->no_io_error_seen) { - spin_lock(&sdev->stat_lock); - sdev->stat.read_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.read_errors++; + spin_unlock(&sctx->stat_lock); if (__ratelimit(&_rs)) scrub_print_warning("i/o error", sblock_to_check); - btrfs_dev_stat_inc_and_print(sdev->dev, + btrfs_dev_stat_inc_and_print(sctx->dev, BTRFS_DEV_STAT_READ_ERRS); } else if (sblock_bad->checksum_error) { - spin_lock(&sdev->stat_lock); - sdev->stat.csum_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.csum_errors++; + spin_unlock(&sctx->stat_lock); if (__ratelimit(&_rs)) scrub_print_warning("checksum error", sblock_to_check); - btrfs_dev_stat_inc_and_print(sdev->dev, + btrfs_dev_stat_inc_and_print(sctx->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); } else if (sblock_bad->header_error) { - spin_lock(&sdev->stat_lock); - sdev->stat.verify_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.verify_errors++; + spin_unlock(&sctx->stat_lock); if (__ratelimit(&_rs)) scrub_print_warning("checksum/header error", sblock_to_check); if (sblock_bad->generation_error) - btrfs_dev_stat_inc_and_print(sdev->dev, + btrfs_dev_stat_inc_and_print(sctx->dev, BTRFS_DEV_STAT_GENERATION_ERRS); else - btrfs_dev_stat_inc_and_print(sdev->dev, + btrfs_dev_stat_inc_and_print(sctx->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); } - if (sdev->readonly) + if (sctx->readonly) goto did_not_correct_error; if (!is_metadata && !have_csum) { @@ -779,7 +779,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS); if (!fixup_nodatasum) goto 
did_not_correct_error; - fixup_nodatasum->sdev = sdev; + fixup_nodatasum->sctx = sctx; fixup_nodatasum->logical = logical; fixup_nodatasum->root = fs_info->extent_root; fixup_nodatasum->mirror_num = failed_mirror_index + 1; @@ -796,7 +796,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) atomic_inc(&fs_info->scrubs_running); atomic_inc(&fs_info->scrubs_paused); mutex_unlock(&fs_info->scrub_lock); - atomic_inc(&sdev->fixup_cnt); + atomic_inc(&sctx->fixup_cnt); fixup_nodatasum->work.func = scrub_fixup_nodatasum; btrfs_queue_worker(&fs_info->scrub_workers, &fixup_nodatasum->work); @@ -818,7 +818,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) ret = scrub_recheck_block(fs_info, sblocks_for_recheck + mirror_index, is_metadata, have_csum, csum, - generation, sdev->csum_size); + generation, sctx->csum_size); if (ret) goto did_not_correct_error; } @@ -930,7 +930,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) */ ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, csum, - generation, sdev->csum_size); + generation, sctx->csum_size); if (!ret && !sblock_bad->header_error && !sblock_bad->checksum_error && sblock_bad->no_io_error_seen) @@ -939,23 +939,23 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) goto did_not_correct_error; } else { corrected_error: - spin_lock(&sdev->stat_lock); - sdev->stat.corrected_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.corrected_errors++; + spin_unlock(&sctx->stat_lock); printk_ratelimited_in_rcu(KERN_ERR "btrfs: fixed up error at logical %llu on dev %s\n", (unsigned long long)logical, - rcu_str_deref(sdev->dev->name)); + rcu_str_deref(sctx->dev->name)); } } else { did_not_correct_error: - spin_lock(&sdev->stat_lock); - sdev->stat.uncorrectable_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); printk_ratelimited_in_rcu(KERN_ERR "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", (unsigned long long)logical, - rcu_str_deref(sdev->dev->name)); + rcu_str_deref(sctx->dev->name)); } out: @@ -978,7 +978,7 @@ out: return 0; } -static int scrub_setup_recheck_block(struct scrub_dev *sdev, +static int scrub_setup_recheck_block(struct scrub_ctx *sctx, struct btrfs_mapping_tree *map_tree, u64 length, u64 logical, struct scrub_block *sblocks_for_recheck) @@ -988,7 +988,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev, int ret; /* - * note: the three members sdev, ref_count and outstanding_pages + * note: the three members sctx, ref_count and outstanding_pages * are not used (and not set) in the blocks that are used for * the recheck procedure */ @@ -1028,9 +1028,9 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev, page->mirror_num = mirror_index + 1; page->page = alloc_page(GFP_NOFS); if (!page->page) { - spin_lock(&sdev->stat_lock); - sdev->stat.malloc_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); kfree(bbio); return -ENOMEM; } @@ -1259,14 +1259,14 @@ static void scrub_checksum(struct scrub_block *sblock) static int scrub_checksum_data(struct scrub_block *sblock) { - struct scrub_dev *sdev = sblock->sdev; + struct scrub_ctx *sctx = sblock->sctx; u8 csum[BTRFS_CSUM_SIZE]; u8 *on_disk_csum; struct page *page; void *buffer; u32 crc = ~(u32)0; int fail = 0; - struct 
btrfs_root *root = sdev->dev->dev_root; + struct btrfs_root *root = sctx->dev->dev_root; u64 len; int index; @@ -1278,7 +1278,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) page = sblock->pagev[0].page; buffer = kmap_atomic(page); - len = sdev->sectorsize; + len = sctx->sectorsize; index = 0; for (;;) { u64 l = min_t(u64, len, PAGE_SIZE); @@ -1296,7 +1296,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) } btrfs_csum_final(crc, csum); - if (memcmp(csum, on_disk_csum, sdev->csum_size)) + if (memcmp(csum, on_disk_csum, sctx->csum_size)) fail = 1; return fail; @@ -1304,9 +1304,9 @@ static int scrub_checksum_data(struct scrub_block *sblock) static int scrub_checksum_tree_block(struct scrub_block *sblock) { - struct scrub_dev *sdev = sblock->sdev; + struct scrub_ctx *sctx = sblock->sctx; struct btrfs_header *h; - struct btrfs_root *root = sdev->dev->dev_root; + struct btrfs_root *root = sctx->dev->dev_root; struct btrfs_fs_info *fs_info = root->fs_info; u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; @@ -1324,7 +1324,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) page = sblock->pagev[0].page; mapped_buffer = kmap_atomic(page); h = (struct btrfs_header *)mapped_buffer; - memcpy(on_disk_csum, h->csum, sdev->csum_size); + memcpy(on_disk_csum, h->csum, sctx->csum_size); /* * we don't use the getter functions here, as we @@ -1345,8 +1345,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) BTRFS_UUID_SIZE)) ++fail; - BUG_ON(sdev->nodesize != sdev->leafsize); - len = sdev->nodesize - BTRFS_CSUM_SIZE; + BUG_ON(sctx->nodesize != sctx->leafsize); + len = sctx->nodesize - BTRFS_CSUM_SIZE; mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; index = 0; @@ -1368,7 +1368,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) } btrfs_csum_final(crc, calculated_csum); - if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) + if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) ++crc_fail; return fail || crc_fail; @@ -1377,8 +1377,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) static int scrub_checksum_super(struct scrub_block *sblock) { struct btrfs_super_block *s; - struct scrub_dev *sdev = sblock->sdev; - struct btrfs_root *root = sdev->dev->dev_root; + struct scrub_ctx *sctx = sblock->sctx; + struct btrfs_root *root = sctx->dev->dev_root; struct btrfs_fs_info *fs_info = root->fs_info; u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; @@ -1396,7 +1396,7 @@ static int scrub_checksum_super(struct scrub_block *sblock) page = sblock->pagev[0].page; mapped_buffer = kmap_atomic(page); s = (struct btrfs_super_block *)mapped_buffer; - memcpy(on_disk_csum, s->csum, sdev->csum_size); + memcpy(on_disk_csum, s->csum, sctx->csum_size); if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) ++fail_cor; @@ -1429,7 +1429,7 @@ static int scrub_checksum_super(struct scrub_block *sblock) } btrfs_csum_final(crc, calculated_csum); - if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) + if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) ++fail_cor; if (fail_cor + fail_gen) { @@ -1438,14 +1438,14 @@ static int scrub_checksum_super(struct scrub_block *sblock) * They will get written with the next transaction commit * anyway */ - spin_lock(&sdev->stat_lock); - ++sdev->stat.super_errors; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + ++sctx->stat.super_errors; + spin_unlock(&sctx->stat_lock); if 
(fail_cor) - btrfs_dev_stat_inc_and_print(sdev->dev, + btrfs_dev_stat_inc_and_print(sctx->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); else - btrfs_dev_stat_inc_and_print(sdev->dev, + btrfs_dev_stat_inc_and_print(sctx->dev, BTRFS_DEV_STAT_GENERATION_ERRS); } @@ -1469,21 +1469,21 @@ static void scrub_block_put(struct scrub_block *sblock) } } -static void scrub_submit(struct scrub_dev *sdev) +static void scrub_submit(struct scrub_ctx *sctx) { struct scrub_bio *sbio; - if (sdev->curr == -1) + if (sctx->curr == -1) return; - sbio = sdev->bios[sdev->curr]; - sdev->curr = -1; - atomic_inc(&sdev->in_flight); + sbio = sctx->bios[sctx->curr]; + sctx->curr = -1; + atomic_inc(&sctx->in_flight); btrfsic_submit_bio(READ, sbio->bio); } -static int scrub_add_page_to_bio(struct scrub_dev *sdev, +static int scrub_add_page_to_bio(struct scrub_ctx *sctx, struct scrub_page *spage) { struct scrub_block *sblock = spage->sblock; @@ -1494,20 +1494,20 @@ again: /* * grab a fresh bio or wait for one to become available */ - while (sdev->curr == -1) { - spin_lock(&sdev->list_lock); - sdev->curr = sdev->first_free; - if (sdev->curr != -1) { - sdev->first_free = sdev->bios[sdev->curr]->next_free; - sdev->bios[sdev->curr]->next_free = -1; - sdev->bios[sdev->curr]->page_count = 0; - spin_unlock(&sdev->list_lock); + while (sctx->curr == -1) { + spin_lock(&sctx->list_lock); + sctx->curr = sctx->first_free; + if (sctx->curr != -1) { + sctx->first_free = sctx->bios[sctx->curr]->next_free; + sctx->bios[sctx->curr]->next_free = -1; + sctx->bios[sctx->curr]->page_count = 0; + spin_unlock(&sctx->list_lock); } else { - spin_unlock(&sdev->list_lock); - wait_event(sdev->list_wait, sdev->first_free != -1); + spin_unlock(&sctx->list_lock); + wait_event(sctx->list_wait, sctx->first_free != -1); } } - sbio = sdev->bios[sdev->curr]; + sbio = sctx->bios[sctx->curr]; if (sbio->page_count == 0) { struct bio *bio; @@ -1515,7 +1515,7 @@ again: sbio->logical = spage->logical; bio = sbio->bio; if (!bio) { - bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio); + bio = bio_alloc(GFP_NOFS, sctx->pages_per_bio); if (!bio) return -ENOMEM; sbio->bio = bio; @@ -1523,14 +1523,14 @@ again: bio->bi_private = sbio; bio->bi_end_io = scrub_bio_end_io; - bio->bi_bdev = sdev->dev->bdev; + bio->bi_bdev = sctx->dev->bdev; bio->bi_sector = spage->physical >> 9; sbio->err = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical || sbio->logical + sbio->page_count * PAGE_SIZE != spage->logical) { - scrub_submit(sdev); + scrub_submit(sctx); goto again; } @@ -1542,20 +1542,20 @@ again: sbio->bio = NULL; return -EIO; } - scrub_submit(sdev); + scrub_submit(sctx); goto again; } scrub_block_get(sblock); /* one for the added page */ atomic_inc(&sblock->outstanding_pages); sbio->page_count++; - if (sbio->page_count == sdev->pages_per_bio) - scrub_submit(sdev); + if (sbio->page_count == sctx->pages_per_bio) + scrub_submit(sctx); return 0; } -static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, +static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, u64 physical, u64 flags, u64 gen, int mirror_num, u8 *csum, int force) { @@ -1564,15 +1564,15 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, sblock = kzalloc(sizeof(*sblock), GFP_NOFS); if (!sblock) { - spin_lock(&sdev->stat_lock); - sdev->stat.malloc_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); return -ENOMEM; } /* one ref inside this function, plus one for each page 
later on */ atomic_set(&sblock->ref_count, 1); - sblock->sdev = sdev; + sblock->sctx = sctx; sblock->no_io_error_seen = 1; for (index = 0; len > 0; index++) { @@ -1582,9 +1582,9 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); spage->page = alloc_page(GFP_NOFS); if (!spage->page) { - spin_lock(&sdev->stat_lock); - sdev->stat.malloc_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); while (index > 0) { index--; __free_page(sblock->pagev[index].page); @@ -1593,7 +1593,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, return -ENOMEM; } spage->sblock = sblock; - spage->dev = sdev->dev; + spage->dev = sctx->dev; spage->flags = flags; spage->generation = gen; spage->logical = logical; @@ -1601,7 +1601,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, spage->mirror_num = mirror_num; if (csum) { spage->have_csum = 1; - memcpy(spage->csum, csum, sdev->csum_size); + memcpy(spage->csum, csum, sctx->csum_size); } else { spage->have_csum = 0; } @@ -1616,7 +1616,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, struct scrub_page *spage = sblock->pagev + index; int ret; - ret = scrub_add_page_to_bio(sdev, spage); + ret = scrub_add_page_to_bio(sctx, spage); if (ret) { scrub_block_put(sblock); return ret; @@ -1624,7 +1624,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, } if (force) - scrub_submit(sdev); + scrub_submit(sctx); /* last one frees, either here or in bio completion for last page */ scrub_block_put(sblock); @@ -1634,8 +1634,8 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, static void scrub_bio_end_io(struct bio *bio, int err) { struct scrub_bio *sbio = bio->bi_private; - struct scrub_dev *sdev = sbio->sdev; - struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; + struct scrub_ctx *sctx = sbio->sctx; + struct btrfs_fs_info *fs_info = sctx->dev->dev_root->fs_info; sbio->err = err; sbio->bio = bio; @@ -1646,7 +1646,7 @@ static void scrub_bio_end_io(struct bio *bio, int err) static void scrub_bio_end_io_worker(struct btrfs_work *work) { struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); - struct scrub_dev *sdev = sbio->sdev; + struct scrub_ctx *sctx = sbio->sctx; int i; BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO); @@ -1671,12 +1671,12 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) bio_put(sbio->bio); sbio->bio = NULL; - spin_lock(&sdev->list_lock); - sbio->next_free = sdev->first_free; - sdev->first_free = sbio->index; - spin_unlock(&sdev->list_lock); - atomic_dec(&sdev->in_flight); - wake_up(&sdev->list_wait); + spin_lock(&sctx->list_lock); + sbio->next_free = sctx->first_free; + sctx->first_free = sbio->index; + spin_unlock(&sctx->list_lock); + atomic_dec(&sctx->in_flight); + wake_up(&sctx->list_wait); } static void scrub_block_complete(struct scrub_block *sblock) @@ -1687,7 +1687,7 @@ static void scrub_block_complete(struct scrub_block *sblock) scrub_checksum(sblock); } -static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, +static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, u8 *csum) { struct btrfs_ordered_sum *sum = NULL; @@ -1695,15 +1695,15 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, unsigned long i; unsigned long num_sectors; - while (!list_empty(&sdev->csum_list)) { - sum = 
list_first_entry(&sdev->csum_list, + while (!list_empty(&sctx->csum_list)) { + sum = list_first_entry(&sctx->csum_list, struct btrfs_ordered_sum, list); if (sum->bytenr > logical) return 0; if (sum->bytenr + sum->len > logical) break; - ++sdev->stat.csum_discards; + ++sctx->stat.csum_discards; list_del(&sum->list); kfree(sum); sum = NULL; @@ -1711,10 +1711,10 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, if (!sum) return 0; - num_sectors = sum->len / sdev->sectorsize; + num_sectors = sum->len / sctx->sectorsize; for (i = 0; i < num_sectors; ++i) { if (sum->sums[i].bytenr == logical) { - memcpy(csum, &sum->sums[i].sum, sdev->csum_size); + memcpy(csum, &sum->sums[i].sum, sctx->csum_size); ret = 1; break; } @@ -1727,7 +1727,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, } /* scrub extent tries to collect up to 64 kB for each bio */ -static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, +static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, u64 physical, u64 flags, u64 gen, int mirror_num) { int ret; @@ -1735,20 +1735,20 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, u32 blocksize; if (flags & BTRFS_EXTENT_FLAG_DATA) { - blocksize = sdev->sectorsize; - spin_lock(&sdev->stat_lock); - sdev->stat.data_extents_scrubbed++; - sdev->stat.data_bytes_scrubbed += len; - spin_unlock(&sdev->stat_lock); + blocksize = sctx->sectorsize; + spin_lock(&sctx->stat_lock); + sctx->stat.data_extents_scrubbed++; + sctx->stat.data_bytes_scrubbed += len; + spin_unlock(&sctx->stat_lock); } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - BUG_ON(sdev->nodesize != sdev->leafsize); - blocksize = sdev->nodesize; - spin_lock(&sdev->stat_lock); - sdev->stat.tree_extents_scrubbed++; - sdev->stat.tree_bytes_scrubbed += len; - spin_unlock(&sdev->stat_lock); + BUG_ON(sctx->nodesize != sctx->leafsize); + blocksize = sctx->nodesize; + spin_lock(&sctx->stat_lock); + sctx->stat.tree_extents_scrubbed++; + sctx->stat.tree_bytes_scrubbed += len; + spin_unlock(&sctx->stat_lock); } else { - blocksize = sdev->sectorsize; + blocksize = sctx->sectorsize; BUG_ON(1); } @@ -1758,11 +1758,11 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, if (flags & BTRFS_EXTENT_FLAG_DATA) { /* push csums to sbio */ - have_csum = scrub_find_csum(sdev, logical, l, csum); + have_csum = scrub_find_csum(sctx, logical, l, csum); if (have_csum == 0) - ++sdev->stat.no_csum; + ++sctx->stat.no_csum; } - ret = scrub_pages(sdev, logical, l, physical, flags, gen, + ret = scrub_pages(sctx, logical, l, physical, flags, gen, mirror_num, have_csum ? 
csum : NULL, 0); if (ret) return ret; @@ -1773,11 +1773,11 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, return 0; } -static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, +static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct map_lookup *map, int num, u64 base, u64 length) { struct btrfs_path *path; - struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; + struct btrfs_fs_info *fs_info = sctx->dev->dev_root->fs_info; struct btrfs_root *root = fs_info->extent_root; struct btrfs_root *csum_root = fs_info->csum_root; struct btrfs_extent_item *extent; @@ -1843,8 +1843,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, */ logical = base + offset; - wait_event(sdev->list_wait, - atomic_read(&sdev->in_flight) == 0); + wait_event(sctx->list_wait, + atomic_read(&sctx->in_flight) == 0); atomic_inc(&fs_info->scrubs_paused); wake_up(&fs_info->scrub_pause_wait); @@ -1898,7 +1898,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, * canceled? */ if (atomic_read(&fs_info->scrub_cancel_req) || - atomic_read(&sdev->cancel_req)) { + atomic_read(&sctx->cancel_req)) { ret = -ECANCELED; goto out; } @@ -1907,9 +1907,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, */ if (atomic_read(&fs_info->scrub_pause_req)) { /* push queued extents */ - scrub_submit(sdev); - wait_event(sdev->list_wait, - atomic_read(&sdev->in_flight) == 0); + scrub_submit(sctx); + wait_event(sctx->list_wait, + atomic_read(&sctx->in_flight) == 0); atomic_inc(&fs_info->scrubs_paused); wake_up(&fs_info->scrub_pause_wait); mutex_lock(&fs_info->scrub_lock); @@ -1926,7 +1926,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, ret = btrfs_lookup_csums_range(csum_root, logical, logical + map->stripe_len - 1, - &sdev->csum_list, 1); + &sctx->csum_list, 1); if (ret) goto out; @@ -2004,7 +2004,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, key.objectid; } - ret = scrub_extent(sdev, key.objectid, key.offset, + ret = scrub_extent(sctx, key.objectid, key.offset, key.objectid - logical + physical, flags, generation, mirror_num); if (ret) @@ -2016,12 +2016,12 @@ next: btrfs_release_path(path); logical += increment; physical += map->stripe_len; - spin_lock(&sdev->stat_lock); - sdev->stat.last_physical = physical; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.last_physical = physical; + spin_unlock(&sctx->stat_lock); } /* push queued extents */ - scrub_submit(sdev); + scrub_submit(sctx); out: blk_finish_plug(&plug); @@ -2029,12 +2029,12 @@ out: return ret < 0 ? 
ret : 0; } -static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, +static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length, u64 dev_offset) { struct btrfs_mapping_tree *map_tree = - &sdev->dev->dev_root->fs_info->mapping_tree; + &sctx->dev->dev_root->fs_info->mapping_tree; struct map_lookup *map; struct extent_map *em; int i; @@ -2055,9 +2055,9 @@ static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, goto out; for (i = 0; i < map->num_stripes; ++i) { - if (map->stripes[i].dev == sdev->dev && + if (map->stripes[i].dev == sctx->dev && map->stripes[i].physical == dev_offset) { - ret = scrub_stripe(sdev, map, i, chunk_offset, length); + ret = scrub_stripe(sctx, map, i, chunk_offset, length); if (ret) goto out; } @@ -2069,11 +2069,11 @@ out: } static noinline_for_stack -int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) +int scrub_enumerate_chunks(struct scrub_ctx *sctx, u64 start, u64 end) { struct btrfs_dev_extent *dev_extent = NULL; struct btrfs_path *path; - struct btrfs_root *root = sdev->dev->dev_root; + struct btrfs_root *root = sctx->dev->dev_root; struct btrfs_fs_info *fs_info = root->fs_info; u64 length; u64 chunk_tree; @@ -2094,7 +2094,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) path->search_commit_root = 1; path->skip_locking = 1; - key.objectid = sdev->dev->devid; + key.objectid = sctx->dev->devid; key.offset = 0ull; key.type = BTRFS_DEV_EXTENT_KEY; @@ -2117,7 +2117,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) btrfs_item_key_to_cpu(l, &found_key, slot); - if (found_key.objectid != sdev->dev->devid) + if (found_key.objectid != sctx->dev->devid) break; if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) @@ -2151,7 +2151,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) ret = -ENOENT; break; } - ret = scrub_chunk(sdev, chunk_tree, chunk_objectid, + ret = scrub_chunk(sctx, chunk_tree, chunk_objectid, chunk_offset, length, found_key.offset); btrfs_put_block_group(cache); if (ret) @@ -2170,13 +2170,13 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) return ret < 0 ? 
ret : 0; } -static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) +static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx) { int i; u64 bytenr; u64 gen; int ret; - struct btrfs_device *device = sdev->dev; + struct btrfs_device *device = sctx->dev; struct btrfs_root *root = device->dev_root; if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) @@ -2189,12 +2189,12 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) break; - ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, + ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); if (ret) return ret; } - wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); + wait_event(sctx->list_wait, atomic_read(&sctx->in_flight) == 0); return 0; } @@ -2238,7 +2238,7 @@ static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, int readonly) { - struct scrub_dev *sdev; + struct scrub_ctx *sctx; struct btrfs_fs_info *fs_info = root->fs_info; int ret; struct btrfs_device *dev; @@ -2302,41 +2302,41 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, scrub_workers_put(root); return -EINPROGRESS; } - sdev = scrub_setup_dev(dev); - if (IS_ERR(sdev)) { + sctx = scrub_setup_ctx(dev); + if (IS_ERR(sctx)) { mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); scrub_workers_put(root); - return PTR_ERR(sdev); + return PTR_ERR(sctx); } - sdev->readonly = readonly; - dev->scrub_device = sdev; + sctx->readonly = readonly; + dev->scrub_device = sctx; atomic_inc(&fs_info->scrubs_running); mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); down_read(&fs_info->scrub_super_lock); - ret = scrub_supers(sdev); + ret = scrub_supers(sctx); up_read(&fs_info->scrub_super_lock); if (!ret) - ret = scrub_enumerate_chunks(sdev, start, end); + ret = scrub_enumerate_chunks(sctx, start, end); - wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); + wait_event(sctx->list_wait, atomic_read(&sctx->in_flight) == 0); atomic_dec(&fs_info->scrubs_running); wake_up(&fs_info->scrub_pause_wait); - wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0); + wait_event(sctx->list_wait, atomic_read(&sctx->fixup_cnt) == 0); if (progress) - memcpy(progress, &sdev->stat, sizeof(*progress)); + memcpy(progress, &sctx->stat, sizeof(*progress)); mutex_lock(&fs_info->scrub_lock); dev->scrub_device = NULL; mutex_unlock(&fs_info->scrub_lock); - scrub_free_dev(sdev); + scrub_free_ctx(sctx); scrub_workers_put(root); return ret; @@ -2407,15 +2407,15 @@ int btrfs_scrub_cancel(struct btrfs_root *root) int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev) { struct btrfs_fs_info *fs_info = root->fs_info; - struct scrub_dev *sdev; + struct scrub_ctx *sctx; mutex_lock(&fs_info->scrub_lock); - sdev = dev->scrub_device; - if (!sdev) { + sctx = dev->scrub_device; + if (!sctx) { mutex_unlock(&fs_info->scrub_lock); return -ENOTCONN; } - atomic_inc(&sdev->cancel_req); + atomic_inc(&sctx->cancel_req); while (dev->scrub_device) { mutex_unlock(&fs_info->scrub_lock); wait_event(fs_info->scrub_pause_wait, @@ -2453,15 +2453,15 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, struct btrfs_scrub_progress *progress) { struct btrfs_device *dev; - struct scrub_dev 
*sdev = NULL; + struct scrub_ctx *sctx = NULL; mutex_lock(&root->fs_info->fs_devices->device_list_mutex); dev = btrfs_find_device(root, devid, NULL, NULL); if (dev) - sdev = dev->scrub_device; - if (sdev) - memcpy(progress, &sdev->stat, sizeof(*progress)); + sctx = dev->scrub_device; + if (sctx) + memcpy(progress, &sctx->stat, sizeof(*progress)); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV; + return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 53c06af92e8d..1789cda57efb 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -88,7 +88,7 @@ struct btrfs_device { u8 uuid[BTRFS_UUID_SIZE]; /* per-device scrub information */ - struct scrub_dev *scrub_device; + struct scrub_ctx *scrub_device; struct btrfs_work work; struct rcu_head rcu; -- cgit v1.2.1 From a36cf8b8933e4a7a7f2f2cbc3c70b097e97f7fd1 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 2 Nov 2012 13:26:57 +0100 Subject: Btrfs: remove the block device pointer from the scrub context struct The block device is removed from the scrub context state structure. The scrub code as it is used for the device replace procedure reads the source data from wherever it is optimal. The source device might even be gone (disconnected, for instance due to a hardware failure). Or the drive can be so faulty that the device replace procedure tries to avoid access to the faulty source drive as much as possible, and only if all other mirrors are damaged, as a last resort, the source disk is accessed. The modified scrub code operates as if it were handling the source drive and thereby generates an exact copy of the source disk on the target disk, even if the source disk is not present at all. Therefore the block device pointer to the source disk is removed from the scrub context struct and moved into the lower level scope of the scrub_bio, fixup and page structures where the block device context is known.
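In outline, the change moves the device pointer one level down, out of the shared context and into the per-bio, per-fixup and per-page state. A simplified sketch of the affected structures (field order and omitted members elided, not the verbatim kernel definitions):

	struct scrub_ctx {
		struct scrub_bio *bios[SCRUB_BIOS_PER_CTX];
		struct btrfs_root *dev_root;	/* replaces struct btrfs_device *dev */
		...
	};

	struct scrub_bio {
		struct scrub_ctx *sctx;
		struct btrfs_device *dev;	/* device this bio reads from */
		...
	};

	struct scrub_fixup_nodatasum {
		struct scrub_ctx *sctx;
		struct btrfs_device *dev;	/* device the fixup error message refers to */
		...
	};

Each scrub_page already carries its own dev pointer, so code that reports per-device state, e.g. btrfs_dev_stat_inc_and_print(), now takes the device from sblock->pagev[0].dev instead of the removed sctx->dev.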
Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 133 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 73 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 29c8aac5bda7..822c08a420c2 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -67,6 +67,7 @@ struct scrub_page { struct scrub_bio { int index; struct scrub_ctx *sctx; + struct btrfs_device *dev; struct bio *bio; int err; u64 logical; @@ -93,7 +94,7 @@ struct scrub_block { struct scrub_ctx { struct scrub_bio *bios[SCRUB_BIOS_PER_CTX]; - struct btrfs_device *dev; + struct btrfs_root *dev_root; int first_free; int curr; atomic_t in_flight; @@ -117,6 +118,7 @@ struct scrub_ctx { struct scrub_fixup_nodatasum { struct scrub_ctx *sctx; + struct btrfs_device *dev; u64 logical; struct btrfs_root *root; struct btrfs_work work; @@ -166,8 +168,8 @@ static void scrub_block_put(struct scrub_block *sblock); static int scrub_add_page_to_bio(struct scrub_ctx *sctx, struct scrub_page *spage); static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, int mirror_num, - u8 *csum, int force); + u64 physical, struct btrfs_device *dev, u64 flags, + u64 gen, int mirror_num, u8 *csum, int force); static void scrub_bio_end_io(struct bio *bio, int err); static void scrub_bio_end_io_worker(struct btrfs_work *work); static void scrub_block_complete(struct scrub_block *sblock); @@ -228,9 +230,9 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev) sctx = kzalloc(sizeof(*sctx), GFP_NOFS); if (!sctx) goto nomem; - sctx->dev = dev; sctx->pages_per_bio = pages_per_bio; sctx->curr = -1; + sctx->dev_root = dev->dev_root; for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) { struct scrub_bio *sbio; @@ -345,8 +347,8 @@ err: static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) { - struct btrfs_device *dev = sblock->sctx->dev; - struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; + struct btrfs_device *dev; + struct btrfs_fs_info *fs_info; struct btrfs_path *path; struct btrfs_key found_key; struct extent_buffer *eb; @@ -361,15 +363,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) const int bufsize = 4096; int ret; + WARN_ON(sblock->page_count < 1); + dev = sblock->pagev[0].dev; + fs_info = sblock->sctx->dev_root->fs_info; + path = btrfs_alloc_path(); swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); - BUG_ON(sblock->page_count < 1); swarn.sector = (sblock->pagev[0].physical) >> 9; swarn.logical = sblock->pagev[0].logical; swarn.errstr = errstr; - swarn.dev = dev; + swarn.dev = NULL; swarn.msg_bufsize = bufsize; swarn.scratch_bufsize = bufsize; @@ -405,6 +410,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) } while (ret != 1); } else { swarn.path = path; + swarn.dev = dev; iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, 1, scrub_print_warning_inode, &swarn); @@ -588,7 +594,7 @@ out: printk_ratelimited_in_rcu(KERN_ERR "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", (unsigned long long)fixup->logical, - rcu_str_deref(sctx->dev->name)); + rcu_str_deref(fixup->dev->name)); } btrfs_free_path(path); @@ -615,6 +621,7 @@ out: static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) { struct scrub_ctx *sctx = sblock_to_check->sctx; + struct btrfs_device *dev; struct btrfs_fs_info *fs_info; u64 length; u64 logical; @@ -633,7 
+640,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) DEFAULT_RATELIMIT_BURST); BUG_ON(sblock_to_check->page_count < 1); - fs_info = sctx->dev->dev_root->fs_info; + fs_info = sctx->dev_root->fs_info; length = sblock_to_check->page_count * PAGE_SIZE; logical = sblock_to_check->pagev[0].logical; generation = sblock_to_check->pagev[0].generation; @@ -643,6 +650,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) BTRFS_EXTENT_FLAG_DATA); have_csum = sblock_to_check->pagev[0].have_csum; csum = sblock_to_check->pagev[0].csum; + dev = sblock_to_check->pagev[0].dev; /* * read all mirrors one after the other. This includes to @@ -682,8 +690,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sctx->stat.read_errors++; sctx->stat.uncorrectable_errors++; spin_unlock(&sctx->stat_lock); - btrfs_dev_stat_inc_and_print(sctx->dev, - BTRFS_DEV_STAT_READ_ERRS); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); goto out; } @@ -695,8 +702,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sctx->stat.read_errors++; sctx->stat.uncorrectable_errors++; spin_unlock(&sctx->stat_lock); - btrfs_dev_stat_inc_and_print(sctx->dev, - BTRFS_DEV_STAT_READ_ERRS); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); goto out; } BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); @@ -710,8 +716,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sctx->stat.read_errors++; sctx->stat.uncorrectable_errors++; spin_unlock(&sctx->stat_lock); - btrfs_dev_stat_inc_and_print(sctx->dev, - BTRFS_DEV_STAT_READ_ERRS); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); goto out; } @@ -738,15 +743,14 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) spin_unlock(&sctx->stat_lock); if (__ratelimit(&_rs)) scrub_print_warning("i/o error", sblock_to_check); - btrfs_dev_stat_inc_and_print(sctx->dev, - BTRFS_DEV_STAT_READ_ERRS); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); } else if (sblock_bad->checksum_error) { spin_lock(&sctx->stat_lock); sctx->stat.csum_errors++; spin_unlock(&sctx->stat_lock); if (__ratelimit(&_rs)) scrub_print_warning("checksum error", sblock_to_check); - btrfs_dev_stat_inc_and_print(sctx->dev, + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); } else if (sblock_bad->header_error) { spin_lock(&sctx->stat_lock); @@ -756,10 +760,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) scrub_print_warning("checksum/header error", sblock_to_check); if (sblock_bad->generation_error) - btrfs_dev_stat_inc_and_print(sctx->dev, + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_GENERATION_ERRS); else - btrfs_dev_stat_inc_and_print(sctx->dev, + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); } @@ -780,6 +784,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) if (!fixup_nodatasum) goto did_not_correct_error; fixup_nodatasum->sctx = sctx; + fixup_nodatasum->dev = dev; fixup_nodatasum->logical = logical; fixup_nodatasum->root = fs_info->extent_root; fixup_nodatasum->mirror_num = failed_mirror_index + 1; @@ -945,7 +950,7 @@ corrected_error: printk_ratelimited_in_rcu(KERN_ERR "btrfs: fixed up error at logical %llu on dev %s\n", (unsigned long long)logical, - rcu_str_deref(sctx->dev->name)); + rcu_str_deref(dev->name)); } } else { did_not_correct_error: @@ -955,7 +960,7 @@ did_not_correct_error: printk_ratelimited_in_rcu(KERN_ERR 
"btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", (unsigned long long)logical, - rcu_str_deref(sctx->dev->name)); + rcu_str_deref(dev->name)); } out: @@ -1266,7 +1271,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) void *buffer; u32 crc = ~(u32)0; int fail = 0; - struct btrfs_root *root = sctx->dev->dev_root; + struct btrfs_root *root = sctx->dev_root; u64 len; int index; @@ -1306,7 +1311,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) { struct scrub_ctx *sctx = sblock->sctx; struct btrfs_header *h; - struct btrfs_root *root = sctx->dev->dev_root; + struct btrfs_root *root = sctx->dev_root; struct btrfs_fs_info *fs_info = root->fs_info; u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; @@ -1378,7 +1383,7 @@ static int scrub_checksum_super(struct scrub_block *sblock) { struct btrfs_super_block *s; struct scrub_ctx *sctx = sblock->sctx; - struct btrfs_root *root = sctx->dev->dev_root; + struct btrfs_root *root = sctx->dev_root; struct btrfs_fs_info *fs_info = root->fs_info; u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; @@ -1442,10 +1447,10 @@ static int scrub_checksum_super(struct scrub_block *sblock) ++sctx->stat.super_errors; spin_unlock(&sctx->stat_lock); if (fail_cor) - btrfs_dev_stat_inc_and_print(sctx->dev, + btrfs_dev_stat_inc_and_print(sblock->pagev[0].dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); else - btrfs_dev_stat_inc_and_print(sctx->dev, + btrfs_dev_stat_inc_and_print(sblock->pagev[0].dev, BTRFS_DEV_STAT_GENERATION_ERRS); } @@ -1513,6 +1518,7 @@ again: sbio->physical = spage->physical; sbio->logical = spage->logical; + sbio->dev = spage->dev; bio = sbio->bio; if (!bio) { bio = bio_alloc(GFP_NOFS, sctx->pages_per_bio); @@ -1523,13 +1529,14 @@ again: bio->bi_private = sbio; bio->bi_end_io = scrub_bio_end_io; - bio->bi_bdev = sctx->dev->bdev; - bio->bi_sector = spage->physical >> 9; + bio->bi_bdev = sbio->dev->bdev; + bio->bi_sector = sbio->physical >> 9; sbio->err = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical || sbio->logical + sbio->page_count * PAGE_SIZE != - spage->logical) { + spage->logical || + sbio->dev != spage->dev) { scrub_submit(sctx); goto again; } @@ -1556,8 +1563,8 @@ again: } static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, int mirror_num, - u8 *csum, int force) + u64 physical, struct btrfs_device *dev, u64 flags, + u64 gen, int mirror_num, u8 *csum, int force) { struct scrub_block *sblock; int index; @@ -1593,7 +1600,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, return -ENOMEM; } spage->sblock = sblock; - spage->dev = sctx->dev; + spage->dev = dev; spage->flags = flags; spage->generation = gen; spage->logical = logical; @@ -1634,8 +1641,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, static void scrub_bio_end_io(struct bio *bio, int err) { struct scrub_bio *sbio = bio->bi_private; - struct scrub_ctx *sctx = sbio->sctx; - struct btrfs_fs_info *fs_info = sctx->dev->dev_root->fs_info; + struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; sbio->err = err; sbio->bio = bio; @@ -1728,7 +1734,8 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, /* scrub extent tries to collect up to 64 kB for each bio */ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, int mirror_num) + u64 physical, struct btrfs_device *dev, u64 flags, + u64 gen, int 
mirror_num) { int ret; u8 csum[BTRFS_CSUM_SIZE]; @@ -1762,7 +1769,7 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, if (have_csum == 0) ++sctx->stat.no_csum; } - ret = scrub_pages(sctx, logical, l, physical, flags, gen, + ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen, mirror_num, have_csum ? csum : NULL, 0); if (ret) return ret; @@ -1774,10 +1781,12 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, } static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, - struct map_lookup *map, int num, u64 base, u64 length) + struct map_lookup *map, + struct btrfs_device *scrub_dev, + int num, u64 base, u64 length) { struct btrfs_path *path; - struct btrfs_fs_info *fs_info = sctx->dev->dev_root->fs_info; + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; struct btrfs_root *root = fs_info->extent_root; struct btrfs_root *csum_root = fs_info->csum_root; struct btrfs_extent_item *extent; @@ -1797,7 +1806,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct reada_control *reada2; struct btrfs_key key_start; struct btrfs_key key_end; - u64 increment = map->stripe_len; u64 offset; @@ -2006,7 +2014,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, ret = scrub_extent(sctx, key.objectid, key.offset, key.objectid - logical + physical, - flags, generation, mirror_num); + scrub_dev, flags, generation, + mirror_num); if (ret) goto out; @@ -2030,11 +2039,13 @@ out: } static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, - u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length, - u64 dev_offset) + struct btrfs_device *scrub_dev, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset, u64 length, + u64 dev_offset) { struct btrfs_mapping_tree *map_tree = - &sctx->dev->dev_root->fs_info->mapping_tree; + &sctx->dev_root->fs_info->mapping_tree; struct map_lookup *map; struct extent_map *em; int i; @@ -2055,9 +2066,10 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, goto out; for (i = 0; i < map->num_stripes; ++i) { - if (map->stripes[i].dev == sctx->dev && + if (map->stripes[i].dev->bdev == scrub_dev->bdev && map->stripes[i].physical == dev_offset) { - ret = scrub_stripe(sctx, map, i, chunk_offset, length); + ret = scrub_stripe(sctx, map, scrub_dev, i, + chunk_offset, length); if (ret) goto out; } @@ -2069,11 +2081,12 @@ out: } static noinline_for_stack -int scrub_enumerate_chunks(struct scrub_ctx *sctx, u64 start, u64 end) +int scrub_enumerate_chunks(struct scrub_ctx *sctx, + struct btrfs_device *scrub_dev, u64 start, u64 end) { struct btrfs_dev_extent *dev_extent = NULL; struct btrfs_path *path; - struct btrfs_root *root = sctx->dev->dev_root; + struct btrfs_root *root = sctx->dev_root; struct btrfs_fs_info *fs_info = root->fs_info; u64 length; u64 chunk_tree; @@ -2094,11 +2107,10 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, u64 start, u64 end) path->search_commit_root = 1; path->skip_locking = 1; - key.objectid = sctx->dev->devid; + key.objectid = scrub_dev->devid; key.offset = 0ull; key.type = BTRFS_DEV_EXTENT_KEY; - while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -2117,7 +2129,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, u64 start, u64 end) btrfs_item_key_to_cpu(l, &found_key, slot); - if (found_key.objectid != sctx->dev->devid) + if (found_key.objectid != scrub_dev->devid) break; if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) @@ -2151,7 +2163,7 @@ int scrub_enumerate_chunks(struct 
scrub_ctx *sctx, u64 start, u64 end) ret = -ENOENT; break; } - ret = scrub_chunk(sctx, chunk_tree, chunk_objectid, + ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid, chunk_offset, length, found_key.offset); btrfs_put_block_group(cache); if (ret) @@ -2170,14 +2182,14 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, u64 start, u64 end) return ret < 0 ? ret : 0; } -static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx) +static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, + struct btrfs_device *scrub_dev) { int i; u64 bytenr; u64 gen; int ret; - struct btrfs_device *device = sctx->dev; - struct btrfs_root *root = device->dev_root; + struct btrfs_root *root = sctx->dev_root; if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) return -EIO; @@ -2186,11 +2198,12 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx) for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) + if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes) break; ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, - BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); + scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, + NULL, 1); if (ret) return ret; } @@ -2317,11 +2330,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); down_read(&fs_info->scrub_super_lock); - ret = scrub_supers(sctx); + ret = scrub_supers(sctx, dev); up_read(&fs_info->scrub_super_lock); if (!ret) - ret = scrub_enumerate_chunks(sctx, start, end); + ret = scrub_enumerate_chunks(sctx, dev, start, end); wait_event(sctx->list_wait, atomic_read(&sctx->in_flight) == 0); atomic_dec(&fs_info->scrubs_running); -- cgit v1.2.1 From 7a9e9987681198c56ac7f165725ca322d7a196e1 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 2 Nov 2012 14:58:04 +0100 Subject: Btrfs: make the scrub page array dynamically allocated With the modified design (in order to support the device replace procedure) it is necessary to allocate the page array dynamically. The reason is that pages are reused. At first a page is used for the bio that reads the data from the filesystem, then the same page is reused for the bio that writes the data to the target disk. Since the read process and the write process are completely decoupled, this requires a new concept of refcounts and get/put functions for pages, and it requires using newly created pages for each read bio, which are freed after the write operation is finished. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 195 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 121 insertions(+), 74 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 822c08a420c2..15ac82ae5770 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -46,6 +46,12 @@ struct scrub_ctx; #define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ #define SCRUB_BIOS_PER_CTX 16 /* 1 MB per device in flight */ + +/* + * the following value times PAGE_SIZE needs to be large enough to match the + * largest node/leaf/sector size that shall be supported. + * Values larger than BTRFS_STRIPE_LEN are not supported. 
+ */ #define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ struct scrub_page { @@ -56,6 +62,7 @@ struct scrub_page { u64 generation; u64 logical; u64 physical; + atomic_t ref_count; struct { unsigned int mirror_num:8; unsigned int have_csum:1; @@ -79,7 +86,7 @@ struct scrub_bio { }; struct scrub_block { - struct scrub_page pagev[SCRUB_MAX_PAGES_PER_BLOCK]; + struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK]; int page_count; atomic_t outstanding_pages; atomic_t ref_count; /* free mem on transition to zero */ @@ -165,6 +172,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock); static int scrub_checksum_super(struct scrub_block *sblock); static void scrub_block_get(struct scrub_block *sblock); static void scrub_block_put(struct scrub_block *sblock); +static void scrub_page_get(struct scrub_page *spage); +static void scrub_page_put(struct scrub_page *spage); static int scrub_add_page_to_bio(struct scrub_ctx *sctx, struct scrub_page *spage); static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, @@ -364,15 +373,15 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) int ret; WARN_ON(sblock->page_count < 1); - dev = sblock->pagev[0].dev; + dev = sblock->pagev[0]->dev; fs_info = sblock->sctx->dev_root->fs_info; path = btrfs_alloc_path(); swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); - swarn.sector = (sblock->pagev[0].physical) >> 9; - swarn.logical = sblock->pagev[0].logical; + swarn.sector = (sblock->pagev[0]->physical) >> 9; + swarn.logical = sblock->pagev[0]->logical; swarn.errstr = errstr; swarn.dev = NULL; swarn.msg_bufsize = bufsize; @@ -642,15 +651,15 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) BUG_ON(sblock_to_check->page_count < 1); fs_info = sctx->dev_root->fs_info; length = sblock_to_check->page_count * PAGE_SIZE; - logical = sblock_to_check->pagev[0].logical; - generation = sblock_to_check->pagev[0].generation; - BUG_ON(sblock_to_check->pagev[0].mirror_num < 1); - failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1; - is_metadata = !(sblock_to_check->pagev[0].flags & + logical = sblock_to_check->pagev[0]->logical; + generation = sblock_to_check->pagev[0]->generation; + BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1); + failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1; + is_metadata = !(sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA); - have_csum = sblock_to_check->pagev[0].have_csum; - csum = sblock_to_check->pagev[0].csum; - dev = sblock_to_check->pagev[0].dev; + have_csum = sblock_to_check->pagev[0]->have_csum; + csum = sblock_to_check->pagev[0]->csum; + dev = sblock_to_check->pagev[0]->dev; /* * read all mirrors one after the other. 
This includes to @@ -892,7 +901,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) success = 1; for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { - struct scrub_page *page_bad = sblock_bad->pagev + page_num; + struct scrub_page *page_bad = sblock_bad->pagev[page_num]; if (!page_bad->io_error) continue; @@ -903,8 +912,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) mirror_index++) { struct scrub_block *sblock_other = sblocks_for_recheck + mirror_index; - struct scrub_page *page_other = sblock_other->pagev + - page_num; + struct scrub_page *page_other = sblock_other->pagev[ + page_num]; if (!page_other->io_error) { ret = scrub_repair_page_from_good_copy( @@ -971,11 +980,11 @@ out: mirror_index; int page_index; - for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO; - page_index++) - if (sblock->pagev[page_index].page) - __free_page( - sblock->pagev[page_index].page); + for (page_index = 0; page_index < sblock->page_count; + page_index++) { + sblock->pagev[page_index]->sblock = NULL; + scrub_page_put(sblock->pagev[page_index]); + } } kfree(sblocks_for_recheck); } @@ -993,7 +1002,7 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx, int ret; /* - * note: the three members sctx, ref_count and outstanding_pages + * note: the two members ref_count and outstanding_pages * are not used (and not set) in the blocks that are used for * the recheck procedure */ @@ -1025,21 +1034,27 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx, continue; sblock = sblocks_for_recheck + mirror_index; - page = sblock->pagev + page_index; - page->logical = logical; - page->physical = bbio->stripes[mirror_index].physical; - /* for missing devices, dev->bdev is NULL */ - page->dev = bbio->stripes[mirror_index].dev; - page->mirror_num = mirror_index + 1; - page->page = alloc_page(GFP_NOFS); - if (!page->page) { + sblock->sctx = sctx; + page = kzalloc(sizeof(*page), GFP_NOFS); + if (!page) { +leave_nomem: spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; spin_unlock(&sctx->stat_lock); kfree(bbio); return -ENOMEM; } + scrub_page_get(page); + sblock->pagev[page_index] = page; + page->logical = logical; + page->physical = bbio->stripes[mirror_index].physical; + /* for missing devices, dev->bdev is NULL */ + page->dev = bbio->stripes[mirror_index].dev; + page->mirror_num = mirror_index + 1; sblock->page_count++; + page->page = alloc_page(GFP_NOFS); + if (!page->page) + goto leave_nomem; } kfree(bbio); length -= sublen; @@ -1071,7 +1086,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info, for (page_num = 0; page_num < sblock->page_count; page_num++) { struct bio *bio; int ret; - struct scrub_page *page = sblock->pagev + page_num; + struct scrub_page *page = sblock->pagev[page_num]; DECLARE_COMPLETION_ONSTACK(complete); if (page->dev->bdev == NULL) { @@ -1080,7 +1095,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info, continue; } - BUG_ON(!page->page); + WARN_ON(!page->page); bio = bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; @@ -1125,14 +1140,14 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, struct btrfs_root *root = fs_info->extent_root; void *mapped_buffer; - BUG_ON(!sblock->pagev[0].page); + WARN_ON(!sblock->pagev[0]->page); if (is_metadata) { struct btrfs_header *h; - mapped_buffer = kmap_atomic(sblock->pagev[0].page); + mapped_buffer = kmap_atomic(sblock->pagev[0]->page); h = (struct btrfs_header *)mapped_buffer; - if (sblock->pagev[0].logical != 
le64_to_cpu(h->bytenr) || + if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr) || memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) { @@ -1146,7 +1161,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, if (!have_csum) return; - mapped_buffer = kmap_atomic(sblock->pagev[0].page); + mapped_buffer = kmap_atomic(sblock->pagev[0]->page); } for (page_num = 0;;) { @@ -1162,9 +1177,9 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, page_num++; if (page_num >= sblock->page_count) break; - BUG_ON(!sblock->pagev[page_num].page); + WARN_ON(!sblock->pagev[page_num]->page); - mapped_buffer = kmap_atomic(sblock->pagev[page_num].page); + mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page); } btrfs_csum_final(crc, calculated_csum); @@ -1202,11 +1217,11 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good, int page_num, int force_write) { - struct scrub_page *page_bad = sblock_bad->pagev + page_num; - struct scrub_page *page_good = sblock_good->pagev + page_num; + struct scrub_page *page_bad = sblock_bad->pagev[page_num]; + struct scrub_page *page_good = sblock_good->pagev[page_num]; - BUG_ON(sblock_bad->pagev[page_num].page == NULL); - BUG_ON(sblock_good->pagev[page_num].page == NULL); + BUG_ON(page_bad->page == NULL); + BUG_ON(page_good->page == NULL); if (force_write || sblock_bad->header_error || sblock_bad->checksum_error || page_bad->io_error) { struct bio *bio; @@ -1247,8 +1262,8 @@ static void scrub_checksum(struct scrub_block *sblock) u64 flags; int ret; - BUG_ON(sblock->page_count < 1); - flags = sblock->pagev[0].flags; + WARN_ON(sblock->page_count < 1); + flags = sblock->pagev[0]->flags; ret = 0; if (flags & BTRFS_EXTENT_FLAG_DATA) ret = scrub_checksum_data(sblock); @@ -1276,11 +1291,11 @@ static int scrub_checksum_data(struct scrub_block *sblock) int index; BUG_ON(sblock->page_count < 1); - if (!sblock->pagev[0].have_csum) + if (!sblock->pagev[0]->have_csum) return 0; - on_disk_csum = sblock->pagev[0].csum; - page = sblock->pagev[0].page; + on_disk_csum = sblock->pagev[0]->csum; + page = sblock->pagev[0]->page; buffer = kmap_atomic(page); len = sctx->sectorsize; @@ -1295,8 +1310,8 @@ static int scrub_checksum_data(struct scrub_block *sblock) break; index++; BUG_ON(index >= sblock->page_count); - BUG_ON(!sblock->pagev[index].page); - page = sblock->pagev[index].page; + BUG_ON(!sblock->pagev[index]->page); + page = sblock->pagev[index]->page; buffer = kmap_atomic(page); } @@ -1326,7 +1341,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) int index; BUG_ON(sblock->page_count < 1); - page = sblock->pagev[0].page; + page = sblock->pagev[0]->page; mapped_buffer = kmap_atomic(page); h = (struct btrfs_header *)mapped_buffer; memcpy(on_disk_csum, h->csum, sctx->csum_size); @@ -1337,10 +1352,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) * b) the page is already kmapped */ - if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr)) + if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr)) ++fail; - if (sblock->pagev[0].generation != le64_to_cpu(h->generation)) + if (sblock->pagev[0]->generation != le64_to_cpu(h->generation)) ++fail; if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) @@ -1365,8 +1380,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) break; index++; BUG_ON(index >= sblock->page_count); - BUG_ON(!sblock->pagev[index].page); - page = 
sblock->pagev[index].page; + BUG_ON(!sblock->pagev[index]->page); + page = sblock->pagev[index]->page; mapped_buffer = kmap_atomic(page); mapped_size = PAGE_SIZE; p = mapped_buffer; @@ -1398,15 +1413,15 @@ static int scrub_checksum_super(struct scrub_block *sblock) int index; BUG_ON(sblock->page_count < 1); - page = sblock->pagev[0].page; + page = sblock->pagev[0]->page; mapped_buffer = kmap_atomic(page); s = (struct btrfs_super_block *)mapped_buffer; memcpy(on_disk_csum, s->csum, sctx->csum_size); - if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) + if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr)) ++fail_cor; - if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) + if (sblock->pagev[0]->generation != le64_to_cpu(s->generation)) ++fail_gen; if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) @@ -1426,8 +1441,8 @@ static int scrub_checksum_super(struct scrub_block *sblock) break; index++; BUG_ON(index >= sblock->page_count); - BUG_ON(!sblock->pagev[index].page); - page = sblock->pagev[index].page; + BUG_ON(!sblock->pagev[index]->page); + page = sblock->pagev[index]->page; mapped_buffer = kmap_atomic(page); mapped_size = PAGE_SIZE; p = mapped_buffer; @@ -1447,10 +1462,10 @@ static int scrub_checksum_super(struct scrub_block *sblock) ++sctx->stat.super_errors; spin_unlock(&sctx->stat_lock); if (fail_cor) - btrfs_dev_stat_inc_and_print(sblock->pagev[0].dev, + btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); else - btrfs_dev_stat_inc_and_print(sblock->pagev[0].dev, + btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev, BTRFS_DEV_STAT_GENERATION_ERRS); } @@ -1468,12 +1483,25 @@ static void scrub_block_put(struct scrub_block *sblock) int i; for (i = 0; i < sblock->page_count; i++) - if (sblock->pagev[i].page) - __free_page(sblock->pagev[i].page); + scrub_page_put(sblock->pagev[i]); kfree(sblock); } } +static void scrub_page_get(struct scrub_page *spage) +{ + atomic_inc(&spage->ref_count); +} + +static void scrub_page_put(struct scrub_page *spage) +{ + if (atomic_dec_and_test(&spage->ref_count)) { + if (spage->page) + __free_page(spage->page); + kfree(spage); + } +} + static void scrub_submit(struct scrub_ctx *sctx) { struct scrub_bio *sbio; @@ -1577,28 +1605,28 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, return -ENOMEM; } - /* one ref inside this function, plus one for each page later on */ + /* one ref inside this function, plus one for each page added to + * a bio later on */ atomic_set(&sblock->ref_count, 1); sblock->sctx = sctx; sblock->no_io_error_seen = 1; for (index = 0; len > 0; index++) { - struct scrub_page *spage = sblock->pagev + index; + struct scrub_page *spage; u64 l = min_t(u64, len, PAGE_SIZE); - BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); - spage->page = alloc_page(GFP_NOFS); - if (!spage->page) { + spage = kzalloc(sizeof(*spage), GFP_NOFS); + if (!spage) { +leave_nomem: spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; spin_unlock(&sctx->stat_lock); - while (index > 0) { - index--; - __free_page(sblock->pagev[index].page); - } - kfree(sblock); + scrub_block_put(sblock); return -ENOMEM; } + BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); + scrub_page_get(spage); + sblock->pagev[index] = spage; spage->sblock = sblock; spage->dev = dev; spage->flags = flags; @@ -1613,14 +1641,17 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, spage->have_csum = 0; } sblock->page_count++; + spage->page = alloc_page(GFP_NOFS); + if (!spage->page) + goto leave_nomem; len -= l; logical 
+= l; physical += l; } - BUG_ON(sblock->page_count == 0); + WARN_ON(sblock->page_count == 0); for (index = 0; index < sblock->page_count; index++) { - struct scrub_page *spage = sblock->pagev + index; + struct scrub_page *spage = sblock->pagev[index]; int ret; ret = scrub_add_page_to_bio(sctx, spage); @@ -2289,6 +2320,22 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, return -EINVAL; } + if (fs_info->chunk_root->nodesize > + PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK || + fs_info->chunk_root->sectorsize > + PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) { + /* + * would exhaust the array bounds of pagev member in + * struct scrub_block + */ + pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n", + fs_info->chunk_root->nodesize, + SCRUB_MAX_PAGES_PER_BLOCK, + fs_info->chunk_root->sectorsize, + SCRUB_MAX_PAGES_PER_BLOCK); + return -EINVAL; + } + ret = scrub_workers_get(root); if (ret) return ret; -- cgit v1.2.1 From cb2ced73d8c7a38b5f699e267deadf2a2cfe911c Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 2 Nov 2012 16:14:21 +0100 Subject: Btrfs: in scrub repair code, optimize the reading of mirrors In case disk blocks need to be repaired (rewritten), the current code, for simplicity reasons, reads all alternate mirrors in a first step and selects the best one in a second step. This is now changed to read one alternate mirror after the other and to leave the loop early when a perfect mirror is found. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 35 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 15ac82ae5770..7d38f4073243 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -819,26 +819,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) /* * now build and submit the bios for the other mirrors, check - * checksums - */ - for (mirror_index = 0; - mirror_index < BTRFS_MAX_MIRRORS && - sblocks_for_recheck[mirror_index].page_count > 0; - mirror_index++) { - if (mirror_index == failed_mirror_index) - continue; - - /* build and submit the bios, check checksums */ - ret = scrub_recheck_block(fs_info, - sblocks_for_recheck + mirror_index, - is_metadata, have_csum, csum, - generation, sctx->csum_size); - if (ret) - goto did_not_correct_error; - } - - /* - * first try to pick the mirror which is completely without I/O + * checksums. + * First try to pick the mirror which is completely without I/O * errors and also does not have a checksum error. * If one is found, and if a checksum is present, the full block * that is known to contain an error is rewritten.
Afterwards @@ -854,10 +836,17 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) mirror_index < BTRFS_MAX_MIRRORS && sblocks_for_recheck[mirror_index].page_count > 0; mirror_index++) { - struct scrub_block *sblock_other = sblocks_for_recheck + - mirror_index; + struct scrub_block *sblock_other; - if (!sblock_other->header_error && + if (mirror_index == failed_mirror_index) + continue; + sblock_other = sblocks_for_recheck + mirror_index; + + /* build and submit the bios, check checksums */ + ret = scrub_recheck_block(fs_info, sblock_other, is_metadata, + have_csum, csum, generation, + sctx->csum_size); + if (!ret && !sblock_other->header_error && !sblock_other->checksum_error && sblock_other->no_io_error_seen) { int force_write = is_metadata || have_csum; -- cgit v1.2.1 From 34f5c8e90b3f002672cd6b4e6e7c5b959fd981ae Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 2 Nov 2012 16:16:26 +0100 Subject: Btrfs: in scrub repair code, simplify alloc error handling In the scrub repair code, memory allocation errors are now handled a little more gracefully: such an error is treated just like a read error. This simplifies the code and removes a couple of lines, since the code to handle read errors is there anyway. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 61 ++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 7d38f4073243..fcd5bccaa4ed 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -151,10 +151,10 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx, struct btrfs_mapping_tree *map_tree, u64 length, u64 logical, struct scrub_block *sblock); -static int scrub_recheck_block(struct btrfs_fs_info *fs_info, - struct scrub_block *sblock, int is_metadata, - int have_csum, u8 *csum, u64 generation, - u16 csum_size); +static void scrub_recheck_block(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, int is_metadata, + int have_csum, u8 *csum, u64 generation, + u16 csum_size); static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, struct scrub_block *sblock, int is_metadata, int have_csum, @@ -718,16 +718,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sblock_bad = sblocks_for_recheck + failed_mirror_index; /* build and submit the bios for the failed mirror, check checksums */ - ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, - csum, generation, sctx->csum_size); - if (ret) { - spin_lock(&sctx->stat_lock); - sctx->stat.read_errors++; - sctx->stat.uncorrectable_errors++; - spin_unlock(&sctx->stat_lock); - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); - goto out; - } + scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, + csum, generation, sctx->csum_size); if (!sblock_bad->header_error && !sblock_bad->checksum_error && sblock_bad->no_io_error_seen) { @@ -843,10 +835,11 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sblock_other = sblocks_for_recheck + mirror_index; /* build and submit the bios, check checksums */ - ret = scrub_recheck_block(fs_info, sblock_other, is_metadata, - have_csum, csum, generation, - sctx->csum_size); - if (!ret && !sblock_other->header_error && + scrub_recheck_block(fs_info, sblock_other, is_metadata, + have_csum, csum, generation, + sctx->csum_size); + + if (!sblock_other->header_error &&
!sblock_other->checksum_error && sblock_other->no_io_error_seen) { int force_write = is_metadata || have_csum; @@ -931,10 +924,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * is verified, but most likely the data comes out * of the page cache. */ - ret = scrub_recheck_block(fs_info, sblock_bad, - is_metadata, have_csum, csum, - generation, sctx->csum_size); - if (!ret && !sblock_bad->header_error && + scrub_recheck_block(fs_info, sblock_bad, + is_metadata, have_csum, csum, + generation, sctx->csum_size); + if (!sblock_bad->header_error && !sblock_bad->checksum_error && sblock_bad->no_io_error_seen) goto corrected_error; @@ -1061,10 +1054,10 @@ leave_nomem: * to take those pages that are not errored from all the mirrors so that * the pages that are errored in the just handled mirror can be repaired. */ -static int scrub_recheck_block(struct btrfs_fs_info *fs_info, - struct scrub_block *sblock, int is_metadata, - int have_csum, u8 *csum, u64 generation, - u16 csum_size) +static void scrub_recheck_block(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, int is_metadata, + int have_csum, u8 *csum, u64 generation, + u16 csum_size) { int page_num; @@ -1074,7 +1067,6 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info, for (page_num = 0; page_num < sblock->page_count; page_num++) { struct bio *bio; - int ret; struct scrub_page *page = sblock->pagev[page_num]; DECLARE_COMPLETION_ONSTACK(complete); @@ -1086,18 +1078,17 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info, WARN_ON(!page->page); bio = bio_alloc(GFP_NOFS, 1); - if (!bio) - return -EIO; + if (!bio) { + page->io_error = 1; + sblock->no_io_error_seen = 0; + continue; + } bio->bi_bdev = page->dev->bdev; bio->bi_sector = page->physical >> 9; bio->bi_end_io = scrub_complete_bio_end_io; bio->bi_private = &complete; - ret = bio_add_page(bio, page->page, PAGE_SIZE, 0); - if (PAGE_SIZE != ret) { - bio_put(bio); - return -EIO; - } + bio_add_page(bio, page->page, PAGE_SIZE, 0); btrfsic_submit_bio(READ, bio); /* this will also unplug the queue */ @@ -1114,7 +1105,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info, have_csum, csum, generation, csum_size); - return 0; + return; } static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, -- cgit v1.2.1 From b6bfebc13218f1fc1502041a810919d3a81b8b4e Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 2 Nov 2012 16:44:58 +0100 Subject: Btrfs: cleanup scrub bio and worker wait code Just move some code into functions to make everything more readable. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 106 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 71 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index fcd5bccaa4ed..a67b1a17a009 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011 STRATO. All rights reserved. + * Copyright (C) 2011, 2012 STRATO. All rights reserved. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public @@ -104,8 +104,8 @@ struct scrub_ctx { struct btrfs_root *dev_root; int first_free; int curr; - atomic_t in_flight; - atomic_t fixup_cnt; + atomic_t bios_in_flight; + atomic_t workers_pending; spinlock_t list_lock; wait_queue_head_t list_wait; u16 csum_size; @@ -146,6 +146,10 @@ struct scrub_warning { }; +static void scrub_pending_bio_inc(struct scrub_ctx *sctx); +static void scrub_pending_bio_dec(struct scrub_ctx *sctx); +static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); +static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx); static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); static int scrub_setup_recheck_block(struct scrub_ctx *sctx, struct btrfs_mapping_tree *map_tree, @@ -184,6 +188,59 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work); static void scrub_block_complete(struct scrub_block *sblock); +static void scrub_pending_bio_inc(struct scrub_ctx *sctx) +{ + atomic_inc(&sctx->bios_in_flight); +} + +static void scrub_pending_bio_dec(struct scrub_ctx *sctx) +{ + atomic_dec(&sctx->bios_in_flight); + wake_up(&sctx->list_wait); +} + +/* + * used for workers that require transaction commits (i.e., for the + * NOCOW case) + */ +static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx) +{ + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + + /* + * increment scrubs_running to prevent cancel requests from + * completing as long as a worker is running. we must also + * increment scrubs_paused to prevent deadlocking on pause + * requests used for transactions commits (as the worker uses a + * transaction context). it is safe to regard the worker + * as paused for all matters practical. effectively, we only + * avoid cancellation requests from completing. 
+ */ + mutex_lock(&fs_info->scrub_lock); + atomic_inc(&fs_info->scrubs_running); + atomic_inc(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + atomic_inc(&sctx->workers_pending); +} + +/* used for workers that require transaction commits */ +static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx) +{ + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + + /* + * see scrub_pending_trans_workers_inc() why we're pretending + * to be paused in the scrub counters + */ + mutex_lock(&fs_info->scrub_lock); + atomic_dec(&fs_info->scrubs_running); + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + atomic_dec(&sctx->workers_pending); + wake_up(&fs_info->scrub_pause_wait); + wake_up(&sctx->list_wait); +} + static void scrub_free_csums(struct scrub_ctx *sctx) { while (!list_empty(&sctx->csum_list)) { @@ -264,8 +321,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev) sctx->nodesize = dev->dev_root->nodesize; sctx->leafsize = dev->dev_root->leafsize; sctx->sectorsize = dev->dev_root->sectorsize; - atomic_set(&sctx->in_flight, 0); - atomic_set(&sctx->fixup_cnt, 0); + atomic_set(&sctx->bios_in_flight, 0); + atomic_set(&sctx->workers_pending, 0); atomic_set(&sctx->cancel_req, 0); sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy); INIT_LIST_HEAD(&sctx->csum_list); @@ -609,14 +666,7 @@ out: btrfs_free_path(path); kfree(fixup); - /* see caller why we're pretending to be paused in the scrub counters */ - mutex_lock(&fs_info->scrub_lock); - atomic_dec(&fs_info->scrubs_running); - atomic_dec(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); - atomic_dec(&sctx->fixup_cnt); - wake_up(&fs_info->scrub_pause_wait); - wake_up(&sctx->list_wait); + scrub_pending_trans_workers_dec(sctx); } /* @@ -789,20 +839,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) fixup_nodatasum->logical = logical; fixup_nodatasum->root = fs_info->extent_root; fixup_nodatasum->mirror_num = failed_mirror_index + 1; - /* - * increment scrubs_running to prevent cancel requests from - * completing as long as a fixup worker is running. we must also - * increment scrubs_paused to prevent deadlocking on pause - * requests used for transactions commits (as the worker uses a - * transaction context). it is safe to regard the fixup worker - * as paused for all matters practical. effectively, we only - * avoid cancellation requests from completing. 
- */ - mutex_lock(&fs_info->scrub_lock); - atomic_inc(&fs_info->scrubs_running); - atomic_inc(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); - atomic_inc(&sctx->fixup_cnt); + scrub_pending_trans_workers_inc(sctx); fixup_nodatasum->work.func = scrub_fixup_nodatasum; btrfs_queue_worker(&fs_info->scrub_workers, &fixup_nodatasum->work); @@ -1491,7 +1528,7 @@ static void scrub_submit(struct scrub_ctx *sctx) sbio = sctx->bios[sctx->curr]; sctx->curr = -1; - atomic_inc(&sctx->in_flight); + scrub_pending_bio_inc(sctx); btrfsic_submit_bio(READ, sbio->bio); } @@ -1692,8 +1729,7 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) sbio->next_free = sctx->first_free; sctx->first_free = sbio->index; spin_unlock(&sctx->list_lock); - atomic_dec(&sctx->in_flight); - wake_up(&sctx->list_wait); + scrub_pending_bio_dec(sctx); } static void scrub_block_complete(struct scrub_block *sblock) @@ -1863,7 +1899,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, logical = base + offset; wait_event(sctx->list_wait, - atomic_read(&sctx->in_flight) == 0); + atomic_read(&sctx->bios_in_flight) == 0); atomic_inc(&fs_info->scrubs_paused); wake_up(&fs_info->scrub_pause_wait); @@ -1928,7 +1964,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, /* push queued extents */ scrub_submit(sctx); wait_event(sctx->list_wait, - atomic_read(&sctx->in_flight) == 0); + atomic_read(&sctx->bios_in_flight) == 0); atomic_inc(&fs_info->scrubs_paused); wake_up(&fs_info->scrub_pause_wait); mutex_lock(&fs_info->scrub_lock); @@ -2218,7 +2254,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, if (ret) return ret; } - wait_event(sctx->list_wait, atomic_read(&sctx->in_flight) == 0); + wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); return 0; } @@ -2363,11 +2399,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, if (!ret) ret = scrub_enumerate_chunks(sctx, dev, start, end); - wait_event(sctx->list_wait, atomic_read(&sctx->in_flight) == 0); + wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); atomic_dec(&fs_info->scrubs_running); wake_up(&fs_info->scrub_pause_wait); - wait_event(sctx->list_wait, atomic_read(&sctx->fixup_cnt) == 0); + wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0); if (progress) memcpy(progress, &sctx->stat, sizeof(*progress)); -- cgit v1.2.1 From beaf8ab3afef27ed81255d9808b67f6d390ca06f Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 12 Nov 2012 14:03:45 +0100 Subject: Btrfs: move some common code into a subfunction Code to open block devices, read the superblock and handle errors was duplicated in three places, and the following patch makes use of it as well. This code is now moved into a subfunction.
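[Illustrative sketch, not part of the patch: a typical caller of the new helper, based on the signature and semantics in the hunks below. The identifiers device_path and holder stand in for whatever the real call site has at hand; error unwinding beyond the helper itself is elided.]

	struct block_device *bdev;
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	int ret;

	/* opens the device, optionally flushes dirty pages, sets the block
	 * size to 4096, invalidates stale buffers and reads the superblock,
	 * all in one call; *bdev and *bh are set to NULL on failure */
	ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ | FMODE_EXCL,
				    holder, 1 /* flush */, &bdev, &bh);
	if (ret)
		return ret;

	disk_super = (struct btrfs_super_block *)bh->b_data;
	/* ... read devid and uuid out of disk_super->dev_item ... */

	brelse(bh);
	blkdev_put(bdev, FMODE_READ | FMODE_EXCL);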
Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 93 +++++++++++++++++++++++++++++------------------------- 1 file changed, 50 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 92e586bc8004..4def1fdbf755 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -108,6 +108,44 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) return NULL; } +static int +btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, + int flush, struct block_device **bdev, + struct buffer_head **bh) +{ + int ret; + + *bdev = blkdev_get_by_path(device_path, flags, holder); + + if (IS_ERR(*bdev)) { + ret = PTR_ERR(*bdev); + printk(KERN_INFO "btrfs: open %s failed\n", device_path); + goto error; + } + + if (flush) + filemap_write_and_wait((*bdev)->bd_inode->i_mapping); + ret = set_blocksize(*bdev, 4096); + if (ret) { + blkdev_put(*bdev, flags); + goto error; + } + invalidate_bdev(*bdev); + *bh = btrfs_read_dev_super(*bdev); + if (!*bh) { + ret = -EINVAL; + blkdev_put(*bdev, flags); + goto error; + } + + return 0; + +error: + *bdev = NULL; + *bh = NULL; + return ret; +} + static void requeue_list(struct btrfs_pending_bios *pending_bios, struct bio *head, struct bio *tail) { @@ -637,18 +675,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, if (!device->name) continue; - bdev = blkdev_get_by_path(device->name->str, flags, holder); - if (IS_ERR(bdev)) { - printk(KERN_INFO "btrfs: open %s failed\n", device->name->str); - goto error; - } - filemap_write_and_wait(bdev->bd_inode->i_mapping); - invalidate_bdev(bdev); - set_blocksize(bdev, 4096); - - bh = btrfs_read_dev_super(bdev); - if (!bh) - goto error_close; + ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, + &bdev, &bh); + if (ret) + continue; disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); @@ -697,9 +727,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, error_brelse: brelse(bh); -error_close: blkdev_put(bdev, flags); -error: continue; } if (fs_devices->open_devices == 0) { @@ -744,22 +772,10 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, u64 total_devices; flags |= FMODE_EXCL; - bdev = blkdev_get_by_path(path, flags, holder); - - if (IS_ERR(bdev)) { - ret = PTR_ERR(bdev); - goto error; - } - mutex_lock(&uuid_mutex); - ret = set_blocksize(bdev, 4096); + ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh); if (ret) - goto error_close; - bh = btrfs_read_dev_super(bdev); - if (!bh) { - ret = -EINVAL; - goto error_close; - } + goto error; disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); transid = btrfs_super_generation(disk_super); @@ -777,10 +793,9 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, if (!ret && fs_devices_ret) (*fs_devices_ret)->total_devices = total_devices; brelse(bh); -error_close: - mutex_unlock(&uuid_mutex); blkdev_put(bdev, flags); error: + mutex_unlock(&uuid_mutex); return ret; } @@ -1374,20 +1389,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) goto out; } } else { - bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL, - root->fs_info->bdev_holder); - if (IS_ERR(bdev)) { - ret = PTR_ERR(bdev); + ret = btrfs_get_bdev_and_sb(device_path, + FMODE_READ | FMODE_EXCL, + root->fs_info->bdev_holder, 0, + &bdev, &bh); + if (ret) goto out; - } - - 
set_blocksize(bdev, 4096); - invalidate_bdev(bdev); - bh = btrfs_read_dev_super(bdev); - if (!bh) { - ret = -EINVAL; - goto error_close; - } disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); dev_uuid = disk_super->dev_item.uuid; -- cgit v1.2.1 From 7ba15b7d211846c187a7c5dc75a5964476f8bc89 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 14:42:30 +0100 Subject: Btrfs: add two more find_device() methods The new function btrfs_find_device_missing_or_by_path() will be used for the device replace procedure. This function itself calls the second new function btrfs_find_device_by_path(). Unfortunately, it is currently not possible to make the rest of the code use these functions as well, since the existing functions that look similar at first sight each differ slightly in what they do. But in the future, new code could benefit from these two new functions, and currently, device replace uses them. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 5 +++++ 2 files changed, 64 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4def1fdbf755..1483041eb86a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1522,6 +1522,65 @@ error_undo: goto error_brelse; } +int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, + struct btrfs_device **device) +{ + int ret = 0; + struct btrfs_super_block *disk_super; + u64 devid; + u8 *dev_uuid; + struct block_device *bdev; + struct buffer_head *bh; + + *device = NULL; + ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, + root->fs_info->bdev_holder, 0, &bdev, &bh); + if (ret) + return ret; + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = btrfs_stack_device_id(&disk_super->dev_item); + dev_uuid = disk_super->dev_item.uuid; + *device = btrfs_find_device(root, devid, dev_uuid, + disk_super->fsid); + brelse(bh); + if (!*device) + ret = -ENOENT; + blkdev_put(bdev, FMODE_READ); + return ret; +} + +int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, + char *device_path, + struct btrfs_device **device) +{ + *device = NULL; + if (strcmp(device_path, "missing") == 0) { + struct list_head *devices; + struct btrfs_device *tmp; + + devices = &root->fs_info->fs_devices->devices; + /* + * It is safe to read the devices since the volume_mutex + * is held by the caller. + */ + list_for_each_entry(tmp, devices, dev_list) { + if (tmp->in_fs_metadata && !tmp->bdev) { + *device = tmp; + break; + } + } + + if (!*device) { + pr_err("btrfs: no missing device found\n"); + return -ENOENT; + } + + return 0; + } else { + return btrfs_find_device_by_path(root, device_path, device); + } +} + /* * does all the dirty work required for changing file system's UUID.
*/ diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 1789cda57efb..657bb12b3069 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -268,6 +268,11 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_fs_devices **fs_devices_ret); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); +int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, + char *device_path, + struct btrfs_device **device); +int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, + struct btrfs_device **device); int btrfs_add_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_device *device); -- cgit v1.2.1 From 5d9640517d92d05843711ea982cbeff42d7ed32d Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 14:59:07 +0100 Subject: Btrfs: Pass fs_info to btrfs_num_copies() instead of mapping_tree This is required for the device replace procedure in a later step. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/check-integrity.c | 12 ++++++------ fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent_io.c | 11 +++++------ fs/btrfs/volumes.c | 3 ++- fs/btrfs/volumes.h | 2 +- 5 files changed, 15 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 5a3e45db642a..58dfac1359a3 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -723,7 +723,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, } num_copies = - btrfs_num_copies(&state->root->fs_info->mapping_tree, + btrfs_num_copies(state->root->fs_info, next_bytenr, state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", @@ -903,7 +903,7 @@ static int btrfsic_process_superblock_dev_mirror( } num_copies = - btrfs_num_copies(&state->root->fs_info->mapping_tree, + btrfs_num_copies(state->root->fs_info, next_bytenr, state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", @@ -1287,7 +1287,7 @@ static int btrfsic_create_link_to_next_block( *next_blockp = NULL; if (0 == *num_copiesp) { *num_copiesp = - btrfs_num_copies(&state->root->fs_info->mapping_tree, + btrfs_num_copies(state->root->fs_info, next_bytenr, state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", @@ -1489,7 +1489,7 @@ static int btrfsic_handle_extent_data( chunk_len = num_bytes; num_copies = - btrfs_num_copies(&state->root->fs_info->mapping_tree, + btrfs_num_copies(state->root->fs_info, next_bytenr, state->datablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", @@ -2463,7 +2463,7 @@ static int btrfsic_process_written_superblock( } num_copies = - btrfs_num_copies(&state->root->fs_info->mapping_tree, + btrfs_num_copies(state->root->fs_info, next_bytenr, BTRFS_SUPER_INFO_SIZE); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", @@ -2960,7 +2960,7 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, struct btrfsic_block_data_ctx block_ctx; int match = 0; - num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, + num_copies = btrfs_num_copies(state->root->fs_info, bytenr, state->metablock_size); for (mirror_num = 1; 
mirror_num <= num_copies; mirror_num++) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ff5d259ac275..ba2b931fd8f6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -387,7 +387,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) break; - num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, + num_copies = btrfs_num_copies(root->fs_info, eb->start, eb->len); if (num_copies == 1) break; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3c062c8d1d70..e0b7138909f0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2044,10 +2044,10 @@ static int clean_io_failure(u64 start, struct page *page) spin_unlock(&BTRFS_I(inode)->io_tree.lock); if (state && state->start == failrec->start) { - map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; - num_copies = btrfs_num_copies(map_tree, failrec->logical, - failrec->len); + num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info, + failrec->logical, failrec->len); if (num_copies > 1) { + map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; ret = repair_io_failure(map_tree, start, failrec->len, failrec->logical, page, failrec->failed_mirror); @@ -2157,9 +2157,8 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, * clean_io_failure() clean all those errors at once. */ } - num_copies = btrfs_num_copies( - &BTRFS_I(inode)->root->fs_info->mapping_tree, - failrec->logical, failrec->len); + num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info, + failrec->logical, failrec->len); if (num_copies == 1) { /* * we only have a single copy of the data, so don't bother with diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1483041eb86a..5612767b910e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3785,8 +3785,9 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) } } -int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) +int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) { + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; struct extent_map *em; struct map_lookup *map; struct extent_map_tree *em_tree = &map_tree->map_tree; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 657bb12b3069..35ea4424963b 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -278,7 +278,7 @@ int btrfs_add_device(struct btrfs_trans_handle *trans, struct btrfs_device *device); int btrfs_rm_device(struct btrfs_root *root, char *device_path); void btrfs_cleanup_fs_uuids(void); -int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); +int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size); struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, -- cgit v1.2.1 From 3ec706c831d4c96905c287013c8228b21619a1d9 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 15:46:42 +0100 Subject: Btrfs: pass fs_info to btrfs_map_block() instead of mapping_tree This is required for the device replace procedure in a later step. Two calling functions also had to be changed to have the fs_info pointer: repair_io_failure() and scrub_setup_recheck_block(). 
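[Illustrative sketch, not part of the patch: the typical call-site conversion. Every caller can reach fs_info through its root, so the mapping tree no longer needs to be dug out at each call site; __btrfs_map_block() now fetches &fs_info->mapping_tree internally, as the hunks below show.]

	/* before: the caller had to reach into fs_info for the mapping tree */
	ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, logical,
			      &map_length, &bbio, mirror_num);

	/* after: the caller passes fs_info directly */
	ret = btrfs_map_block(root->fs_info, READ, logical,
			      &map_length, &bbio, mirror_num);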
Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/check-integrity.c | 2 +- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/extent_io.c | 19 +++++++++---------- fs/btrfs/extent_io.h | 4 ++-- fs/btrfs/inode.c | 12 +++++------- fs/btrfs/reada.c | 3 +-- fs/btrfs/scrub.c | 14 +++++++------- fs/btrfs/volumes.c | 11 +++++------ fs/btrfs/volumes.h | 2 +- 9 files changed, 32 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 58dfac1359a3..8f9abedae2c3 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1582,7 +1582,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, struct btrfs_device *device; length = len; - ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ, + ret = btrfs_map_block(state->root->fs_info, READ, bytenr, &length, &multi, mirror_num); device = multi->stripes[0].dev; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f8a358aee060..b4d438f6c2b3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1818,7 +1818,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, /* Tell the block device(s) that the sectors can be discarded */ - ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, + ret = btrfs_map_block(root->fs_info, REQ_DISCARD, bytenr, &num_bytes, &bbio, 0); /* Error condition is -ENOMEM */ if (!ret) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e0b7138909f0..62ec6e45f705 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1917,12 +1917,12 @@ static void repair_io_failure_callback(struct bio *bio, int err) * the standard behavior is to write all copies in a raid setup. here we only * want to write the one bad copy. so we do the mapping for ourselves and issue * submit_bio directly. - * to avoid any synchonization issues, wait for the data after writing, which + * to avoid any synchronization issues, wait for the data after writing, which * actually prevents the read that triggered the error from finishing. * currently, there can be no more than two copies of every data bit. thus, * exactly one rewrite is required. 
*/ -int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, +int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, u64 length, u64 logical, struct page *page, int mirror_num) { @@ -1944,7 +1944,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, bio->bi_size = 0; map_length = length; - ret = btrfs_map_block(map_tree, WRITE, logical, + ret = btrfs_map_block(fs_info, WRITE, logical, &map_length, &bbio, mirror_num); if (ret) { bio_put(bio); @@ -1982,14 +1982,13 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, int mirror_num) { - struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; u64 start = eb->start; unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); int ret = 0; for (i = 0; i < num_pages; i++) { struct page *p = extent_buffer_page(eb, i); - ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE, + ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE, start, p, mirror_num); if (ret) break; @@ -2008,7 +2007,7 @@ static int clean_io_failure(u64 start, struct page *page) u64 private; u64 private_failure; struct io_failure_record *failrec; - struct btrfs_mapping_tree *map_tree; + struct btrfs_fs_info *fs_info; struct extent_state *state; int num_copies; int did_repair = 0; @@ -2044,11 +2043,11 @@ static int clean_io_failure(u64 start, struct page *page) spin_unlock(&BTRFS_I(inode)->io_tree.lock); if (state && state->start == failrec->start) { - num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info, - failrec->logical, failrec->len); + fs_info = BTRFS_I(inode)->root->fs_info; + num_copies = btrfs_num_copies(fs_info, failrec->logical, + failrec->len); if (num_copies > 1) { - map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; - ret = repair_io_failure(map_tree, start, failrec->len, + ret = repair_io_failure(fs_info, start, failrec->len, failrec->logical, page, failrec->failed_mirror); did_repair = !ret; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 711d12b80028..2eacfabd3263 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -337,9 +337,9 @@ struct bio * btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, gfp_t gfp_flags); -struct btrfs_mapping_tree; +struct btrfs_fs_info; -int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, +int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, u64 length, u64 logical, struct page *page, int mirror_num); int end_extent_writepage(struct page *page, int err, u64 start, u64 end); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index aabf747d056e..5d1675a8c9e2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1549,7 +1549,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, unsigned long bio_flags) { struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - struct btrfs_mapping_tree *map_tree; u64 logical = (u64)bio->bi_sector << 9; u64 length = 0; u64 map_length; @@ -1559,11 +1558,10 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, return 0; length = bio->bi_size; - map_tree = &root->fs_info->mapping_tree; map_length = length; - ret = btrfs_map_block(map_tree, READ, logical, + ret = btrfs_map_block(root->fs_info, READ, logical, &map_length, NULL, 0); - /* Will always return 0 or 1 with map_multi == NULL */ + /* Will always return 0 with map_multi == NULL */ BUG_ON(ret < 0); if (map_length < length + size) return 1; @@ -6364,7 +6362,6 @@ 
static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, { struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; struct bio *bio; struct bio *orig_bio = dip->orig_bio; struct bio_vec *bvec = orig_bio->bi_io_vec; @@ -6377,7 +6374,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, int async_submit = 0; map_length = orig_bio->bi_size; - ret = btrfs_map_block(map_tree, READ, start_sector << 9, + ret = btrfs_map_block(root->fs_info, READ, start_sector << 9, &map_length, NULL, 0); if (ret) { bio_put(orig_bio); @@ -6431,7 +6428,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, bio->bi_end_io = btrfs_end_dio_bio; map_length = orig_bio->bi_size; - ret = btrfs_map_block(map_tree, READ, start_sector << 9, + ret = btrfs_map_block(root->fs_info, READ, + start_sector << 9, &map_length, NULL, 0); if (ret) { bio_put(bio); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index a955669519a2..0ddc5659f946 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -323,7 +323,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, struct reada_extent *re = NULL; struct reada_extent *re_exist = NULL; struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; struct btrfs_bio *bbio = NULL; struct btrfs_device *dev; struct btrfs_device *prev_dev; @@ -358,7 +357,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, * map block */ length = blocksize; - ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0); + ret = btrfs_map_block(fs_info, REQ_WRITE, logical, &length, &bbio, 0); if (ret || !bbio || length < blocksize) goto error; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a67b1a17a009..894bb2732fcc 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -152,7 +152,7 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx); static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); static int scrub_setup_recheck_block(struct scrub_ctx *sctx, - struct btrfs_mapping_tree *map_tree, + struct btrfs_fs_info *fs_info, u64 length, u64 logical, struct scrub_block *sblock); static void scrub_recheck_block(struct btrfs_fs_info *fs_info, @@ -523,7 +523,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) } if (PageUptodate(page)) { - struct btrfs_mapping_tree *map_tree; + struct btrfs_fs_info *fs_info; if (PageDirty(page)) { /* * we need to write the data to the defect sector. 
the @@ -544,8 +544,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) ret = -EIO; goto out; } - map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; - ret = repair_io_failure(map_tree, offset, PAGE_SIZE, + fs_info = BTRFS_I(inode)->root->fs_info; + ret = repair_io_failure(fs_info, offset, PAGE_SIZE, fixup->logical, page, fixup->mirror_num); unlock_page(page); @@ -754,7 +754,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) } /* setup the context, map the logical blocks and alloc the pages */ - ret = scrub_setup_recheck_block(sctx, &fs_info->mapping_tree, length, + ret = scrub_setup_recheck_block(sctx, fs_info, length, logical, sblocks_for_recheck); if (ret) { spin_lock(&sctx->stat_lock); @@ -1012,7 +1012,7 @@ out: } static int scrub_setup_recheck_block(struct scrub_ctx *sctx, - struct btrfs_mapping_tree *map_tree, + struct btrfs_fs_info *fs_info, u64 length, u64 logical, struct scrub_block *sblocks_for_recheck) { @@ -1036,7 +1036,7 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx, * with a length of PAGE_SIZE, each returned stripe * represents one mirror */ - ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length, + ret = btrfs_map_block(fs_info, WRITE, logical, &mapped_length, &bbio, 0); if (ret || !bbio || mapped_length < sublen) { kfree(bbio); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5612767b910e..96bb2e4446aa 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3826,13 +3826,14 @@ static int find_live_mirror(struct map_lookup *map, int first, int num, return optimal; } -static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, +static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num) { struct extent_map *em; struct map_lookup *map; + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; struct extent_map_tree *em_tree = &map_tree->map_tree; u64 offset; u64 stripe_offset; @@ -4061,11 +4062,11 @@ out: return ret; } -int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, +int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num) { - return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret, + return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, mirror_num); } @@ -4394,7 +4395,6 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, int mirror_num, int async_submit) { - struct btrfs_mapping_tree *map_tree; struct btrfs_device *dev; struct bio *first_bio = bio; u64 logical = (u64)bio->bi_sector << 9; @@ -4406,10 +4406,9 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, struct btrfs_bio *bbio = NULL; length = bio->bi_size; - map_tree = &root->fs_info->mapping_tree; map_length = length; - ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio, + ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, mirror_num); if (ret) /* -ENOMEM */ return ret; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 35ea4424963b..ad5566d4f2c8 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -248,7 +248,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 start, u64 num_bytes); -int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, +int 
btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num); int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, -- cgit v1.2.1 From a8a6dab77997a371f1925a4001021eea3ee5cb88 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 15:50:14 +0100 Subject: Btrfs: add btrfs_scratch_superblock() function This new function is used by the device replace procedure in a later patch. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 18 ++++++++++++++++++ fs/btrfs/volumes.h | 1 + 2 files changed, 19 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 96bb2e4446aa..6cd8a32c4484 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5106,3 +5106,21 @@ int btrfs_get_dev_stats(struct btrfs_root *root, stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; return 0; } + +int btrfs_scratch_superblock(struct btrfs_device *device) +{ + struct buffer_head *bh; + struct btrfs_super_block *disk_super; + + bh = btrfs_read_dev_super(device->bdev); + if (!bh) + return -EINVAL; + disk_super = (struct btrfs_super_block *)bh->b_data; + + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); + set_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + + return 0; +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index ad5566d4f2c8..7eaaf4e61959 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -301,6 +301,7 @@ int btrfs_get_dev_stats(struct btrfs_root *root, int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); +int btrfs_scratch_superblock(struct btrfs_device *device); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, int index) -- cgit v1.2.1 From aa1b8cd409f05e1489ec77ff219eff6ed4b801b8 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 17:03:39 +0100 Subject: Btrfs: pass fs_info instead of root A small number of functions that are used in the device replace procedure, when the operation is resumed at mount time, are unable to pass the same root pointer that would be used in the regular (ioctl) context. Since only the fs_info is required, not the root pointer, the root pointer argument is replaced with an fs_info pointer argument.
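[Illustrative sketch, not part of the patch: what the change means at the call sites, taken from the hunks below. In the ioctl context a root is available and fs_info is derived from it; in the shutdown and resume-at-mount contexts fs_info alone is sufficient.]

	/* ioctl context: the handler still has a root and passes its fs_info */
	ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end,
			      &sa->progress, sa->flags & BTRFS_SCRUB_READONLY);

	/* close/resume context: no root pointer is needed */
	btrfs_scrub_cancel(fs_info);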
Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 11 ++++---- fs/btrfs/disk-io.c | 4 +-- fs/btrfs/ioctl.c | 8 +++--- fs/btrfs/scrub.c | 76 ++++++++++++++++++++++++------------------------------ fs/btrfs/super.c | 2 +- fs/btrfs/volumes.c | 23 +++++++++-------- fs/btrfs/volumes.h | 2 +- 7 files changed, 60 insertions(+), 66 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f9a078661ebc..f8bb62c82b0c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3540,15 +3540,16 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending); /* scrub.c */ -int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, - struct btrfs_scrub_progress *progress, int readonly); +int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, + u64 end, struct btrfs_scrub_progress *progress, + int readonly); void btrfs_scrub_pause(struct btrfs_root *root); void btrfs_scrub_pause_super(struct btrfs_root *root); void btrfs_scrub_continue(struct btrfs_root *root); void btrfs_scrub_continue_super(struct btrfs_root *root); -int __btrfs_scrub_cancel(struct btrfs_fs_info *info); -int btrfs_scrub_cancel(struct btrfs_root *root); -int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev); +int btrfs_scrub_cancel(struct btrfs_fs_info *info); +int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info, + struct btrfs_device *dev); int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid); int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, struct btrfs_scrub_progress *progress); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ba2b931fd8f6..42a8024e935f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3283,9 +3283,9 @@ int close_ctree(struct btrfs_root *root) smp_mb(); /* pause restriper - we want to resume on mount */ - btrfs_pause_balance(root->fs_info); + btrfs_pause_balance(fs_info); - btrfs_scrub_cancel(root); + btrfs_scrub_cancel(fs_info); /* wait for any defraggers to finish */ wait_event(fs_info->transaction_wait, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e262cd8c4a7d..b40b827f93e7 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1343,7 +1343,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, printk(KERN_INFO "btrfs: resizing devid %llu\n", (unsigned long long)devid); } - device = btrfs_find_device(root, devid, NULL, NULL); + device = btrfs_find_device(root->fs_info, devid, NULL, NULL); if (!device) { printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", (unsigned long long)devid); @@ -2332,7 +2332,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) s_uuid = di_args->uuid; mutex_lock(&fs_devices->device_list_mutex); - dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL); + dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL); mutex_unlock(&fs_devices->device_list_mutex); if (!dev) { @@ -3089,7 +3089,7 @@ static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) if (IS_ERR(sa)) return PTR_ERR(sa); - ret = btrfs_scrub_dev(root, sa->devid, sa->start, sa->end, + ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end, &sa->progress, sa->flags & BTRFS_SCRUB_READONLY); if (copy_to_user(arg, sa, sizeof(*sa))) @@ -3104,7 +3104,7 @@ static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - return btrfs_scrub_cancel(root); + return 
btrfs_scrub_cancel(root->fs_info); } static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 894bb2732fcc..6cf23f4f7bb7 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2262,9 +2262,8 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, /* * get a reference count on fs_info->scrub_workers. start worker if necessary */ -static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) +static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; mutex_lock(&fs_info->scrub_lock); @@ -2283,10 +2282,8 @@ out: return ret; } -static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) +static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_info *fs_info = root->fs_info; - mutex_lock(&fs_info->scrub_lock); if (--fs_info->scrub_workers_refcnt == 0) btrfs_stop_workers(&fs_info->scrub_workers); @@ -2294,29 +2291,29 @@ static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) mutex_unlock(&fs_info->scrub_lock); } - -int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, - struct btrfs_scrub_progress *progress, int readonly) +int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, + u64 end, struct btrfs_scrub_progress *progress, + int readonly) { struct scrub_ctx *sctx; - struct btrfs_fs_info *fs_info = root->fs_info; int ret; struct btrfs_device *dev; - if (btrfs_fs_closing(root->fs_info)) + if (btrfs_fs_closing(fs_info)) return -EINVAL; /* * check some assumptions */ - if (root->nodesize != root->leafsize) { + if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) { printk(KERN_ERR "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", - root->nodesize, root->leafsize); + fs_info->chunk_root->nodesize, + fs_info->chunk_root->leafsize); return -EINVAL; } - if (root->nodesize > BTRFS_STRIPE_LEN) { + if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) { /* * in this case scrub is unable to calculate the checksum * the way scrub is implemented. 
Do not handle this @@ -2324,15 +2321,16 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, */ printk(KERN_ERR "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", - root->nodesize, BTRFS_STRIPE_LEN); + fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN); return -EINVAL; } - if (root->sectorsize != PAGE_SIZE) { + if (fs_info->chunk_root->sectorsize != PAGE_SIZE) { /* not supported for data w/o checksums */ printk(KERN_ERR "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n", - root->sectorsize, (unsigned long long)PAGE_SIZE); + fs_info->chunk_root->sectorsize, + (unsigned long long)PAGE_SIZE); return -EINVAL; } @@ -2352,37 +2350,37 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, return -EINVAL; } - ret = scrub_workers_get(root); + ret = scrub_workers_get(fs_info); if (ret) return ret; - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(root, devid, NULL, NULL); + mutex_lock(&fs_info->fs_devices->device_list_mutex); + dev = btrfs_find_device(fs_info, devid, NULL, NULL); if (!dev || dev->missing) { - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - scrub_workers_put(root); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + scrub_workers_put(fs_info); return -ENODEV; } mutex_lock(&fs_info->scrub_lock); if (!dev->in_fs_metadata) { mutex_unlock(&fs_info->scrub_lock); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - scrub_workers_put(root); - return -ENODEV; + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + scrub_workers_put(fs_info); + return -EIO; } if (dev->scrub_device) { mutex_unlock(&fs_info->scrub_lock); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - scrub_workers_put(root); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + scrub_workers_put(fs_info); return -EINPROGRESS; } sctx = scrub_setup_ctx(dev); if (IS_ERR(sctx)) { mutex_unlock(&fs_info->scrub_lock); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - scrub_workers_put(root); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + scrub_workers_put(fs_info); return PTR_ERR(sctx); } sctx->readonly = readonly; @@ -2390,7 +2388,7 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, atomic_inc(&fs_info->scrubs_running); mutex_unlock(&fs_info->scrub_lock); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); down_read(&fs_info->scrub_super_lock); ret = scrub_supers(sctx, dev); @@ -2413,7 +2411,7 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, mutex_unlock(&fs_info->scrub_lock); scrub_free_ctx(sctx); - scrub_workers_put(root); + scrub_workers_put(fs_info); return ret; } @@ -2453,9 +2451,8 @@ void btrfs_scrub_continue_super(struct btrfs_root *root) up_write(&root->fs_info->scrub_super_lock); } -int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) +int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) { - mutex_lock(&fs_info->scrub_lock); if (!atomic_read(&fs_info->scrubs_running)) { mutex_unlock(&fs_info->scrub_lock); @@ -2475,14 +2472,9 @@ int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) return 0; } -int btrfs_scrub_cancel(struct btrfs_root *root) +int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info, + struct btrfs_device *dev) { - return __btrfs_scrub_cancel(root->fs_info); -} - -int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev) -{ - struct 
btrfs_fs_info *fs_info = root->fs_info; struct scrub_ctx *sctx; mutex_lock(&fs_info->scrub_lock); @@ -2514,12 +2506,12 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid) * does not go away in cancel_dev. FIXME: find a better solution */ mutex_lock(&fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(root, devid, NULL, NULL); + dev = btrfs_find_device(fs_info, devid, NULL, NULL); if (!dev) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); return -ENODEV; } - ret = btrfs_scrub_cancel_dev(root, dev); + ret = btrfs_scrub_cancel_dev(fs_info, dev); mutex_unlock(&fs_info->fs_devices->device_list_mutex); return ret; @@ -2532,7 +2524,7 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, struct scrub_ctx *sctx = NULL; mutex_lock(&root->fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(root, devid, NULL, NULL); + dev = btrfs_find_device(root->fs_info, devid, NULL, NULL); if (dev) sctx = dev->scrub_device; if (sctx) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index acd2df85bed5..a1a6c296ddcd 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -116,7 +116,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { sb->s_flags |= MS_RDONLY; printk(KERN_INFO "btrfs is forced readonly\n"); - __btrfs_scrub_cancel(fs_info); + btrfs_scrub_cancel(fs_info); // WARN_ON(1); } } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6cd8a32c4484..d2c0bccca607 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1398,7 +1398,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); dev_uuid = disk_super->dev_item.uuid; - device = btrfs_find_device(root, devid, dev_uuid, + device = btrfs_find_device(root->fs_info, devid, dev_uuid, disk_super->fsid); if (!device) { ret = -ENOENT; @@ -1435,7 +1435,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) spin_unlock(&root->fs_info->free_chunk_lock); device->in_fs_metadata = 0; - btrfs_scrub_cancel_dev(root, device); + btrfs_scrub_cancel_dev(root->fs_info, device); /* * the device list mutex makes sure that we don't change @@ -1492,7 +1492,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) * at this point, the device is zero sized. 
We want to * remove it from the devices list and zero out the old super */ - if (clear_super) { + if (clear_super && disk_super) { /* make sure this device isn't detected as part of * the FS anymore */ @@ -1540,7 +1540,7 @@ int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); dev_uuid = disk_super->dev_item.uuid; - *device = btrfs_find_device(root, devid, dev_uuid, + *device = btrfs_find_device(root->fs_info, devid, dev_uuid, disk_super->fsid); brelse(bh); if (!*device) @@ -1699,7 +1699,8 @@ next_slot: read_extent_buffer(leaf, fs_uuid, (unsigned long)btrfs_device_fsid(dev_item), BTRFS_UUID_SIZE); - device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); + device = btrfs_find_device(root->fs_info, devid, dev_uuid, + fs_uuid); BUG_ON(!device); /* Logic error */ if (device->fs_devices->seeding) { @@ -4463,13 +4464,13 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, return 0; } -struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, +struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, u8 *uuid, u8 *fsid) { struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; - cur_devices = root->fs_info->fs_devices; + cur_devices = fs_info->fs_devices; while (cur_devices) { if (!fsid || !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { @@ -4567,8 +4568,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, read_extent_buffer(leaf, uuid, (unsigned long) btrfs_stripe_dev_uuid_nr(chunk, i), BTRFS_UUID_SIZE); - map->stripes[i].dev = btrfs_find_device(root, devid, uuid, - NULL); + map->stripes[i].dev = btrfs_find_device(root->fs_info, devid, + uuid, NULL); if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { kfree(map); free_extent_map(em); @@ -4686,7 +4687,7 @@ static int read_one_dev(struct btrfs_root *root, return ret; } - device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); + device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid); if (!device || !device->bdev) { if (!btrfs_test_opt(root, DEGRADED)) return -EIO; @@ -5078,7 +5079,7 @@ int btrfs_get_dev_stats(struct btrfs_root *root, int i; mutex_lock(&fs_devices->device_list_mutex); - dev = btrfs_find_device(root, stats->devid, NULL, NULL); + dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL); mutex_unlock(&fs_devices->device_list_mutex); if (!dev) { diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 7eaaf4e61959..802e2ba02f09 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -281,7 +281,7 @@ void btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size); -struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, +struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, u8 *uuid, u8 *fsid); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_root *root, char *path); -- cgit v1.2.1 From 1acd6831d98779c88cd57f0a5826d6df0b09f3fa Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 17:11:06 +0100 Subject: Btrfs: avoid risk of a deadlock in btrfs_handle_error Remove the attempt to cancel a running scrub or device replace operation in btrfs_handle_error() because it adds the risk of a deadlock. 
The only penalty of not canceling the operation is that some I/O remains active until the procedure completes. This is basically the same thing that happens to other tasks that are running in user mode context: they are not affected or stopped in btrfs_handle_error(); these tasks just need to handle write errors correctly. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/super.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a1a6c296ddcd..ef2415896b06 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -116,7 +116,16 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { sb->s_flags |= MS_RDONLY; printk(KERN_INFO "btrfs is forced readonly\n"); - btrfs_scrub_cancel(fs_info); + /* + * Note that a running device replace operation is not + * canceled here although there is no way to update + * the progress. It would add the risk of a deadlock; + * therefore the canceling is omitted. The only penalty + * is that some I/O remains active until the procedure + * completes. The next time the filesystem is + * mounted writeable again, the device replace + * operation continues. + */ // WARN_ON(1); } } -- cgit v1.2.1 From e922e087a35c437acef3bc88ce31e59c699c38bd Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 17:26:40 +0100 Subject: Btrfs: enhance btrfs structures for device replace support Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 39 +++++++++++++++++++++++++++++++++++++++ fs/btrfs/disk-io.c | 5 +++++ 2 files changed, 44 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f8bb62c82b0c..0781fd4a5c1a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -885,6 +885,42 @@ struct btrfs_dev_stats_item { __le64 values[BTRFS_DEV_STAT_VALUES_MAX]; } __attribute__ ((__packed__)); +#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0 +#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 +#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0 +#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1 +#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2 +#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3 +#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4 + +struct btrfs_dev_replace { + u64 replace_state; /* see #define above */ + u64 time_started; /* seconds since 1-Jan-1970 */ + u64 time_stopped; /* seconds since 1-Jan-1970 */ + atomic64_t num_write_errors; + atomic64_t num_uncorrectable_read_errors; + + u64 cursor_left; + u64 committed_cursor_left; + u64 cursor_left_last_write_of_item; + u64 cursor_right; + + u64 cont_reading_from_srcdev_mode; /* see #define above */ + + int is_valid; + int item_needs_writeback; + struct btrfs_device *srcdev; + struct btrfs_device *tgtdev; + + pid_t lock_owner; + atomic_t nesting_level; + struct mutex lock_finishing_cancel_unmount; + struct mutex lock_management_lock; + struct mutex lock; + + struct btrfs_scrub_progress scrub_progress; +}; + /* different types of block groups (and chunks) */ #define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) #define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) @@ -1471,6 +1507,9 @@ struct btrfs_fs_info { int backup_root_index; int num_tolerated_disk_barrier_failures; + + /* device replace state */ + struct btrfs_dev_replace dev_replace; }; /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 42a8024e935f..9d1b71060813 100644 --- a/fs/btrfs/disk-io.c +++
b/fs/btrfs/disk-io.c @@ -2131,6 +2131,11 @@ int open_ctree(struct super_block *sb, init_rwsem(&fs_info->extent_commit_sem); init_rwsem(&fs_info->cleanup_work_sem); init_rwsem(&fs_info->subvol_sem); + fs_info->dev_replace.lock_owner = 0; + atomic_set(&fs_info->dev_replace.nesting_level, 0); + mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); + mutex_init(&fs_info->dev_replace.lock_management_lock); + mutex_init(&fs_info->dev_replace.lock); spin_lock_init(&fs_info->qgroup_lock); fs_info->qgroup_tree = RB_ROOT; -- cgit v1.2.1 From a2bff64025d7a707ac49155bb6678a636e55096e Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 17:32:20 +0100 Subject: Btrfs: introduce a btrfs_dev_replace_item type Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/print-tree.c | 3 +++ 2 files changed, 69 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0781fd4a5c1a..147406d0f9a9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -921,6 +921,23 @@ struct btrfs_dev_replace { struct btrfs_scrub_progress scrub_progress; }; +struct btrfs_dev_replace_item { + /* + * grow this item struct at the end for future enhancements and keep + * the existing values unchanged + */ + __le64 src_devid; + __le64 cursor_left; + __le64 cursor_right; + __le64 cont_reading_from_srcdev_mode; + + __le64 replace_state; + __le64 time_started; + __le64 time_stopped; + __le64 num_write_errors; + __le64 num_uncorrectable_read_errors; +} __attribute__ ((__packed__)); + /* different types of block groups (and chunks) */ #define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) #define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) @@ -1762,6 +1779,12 @@ struct btrfs_ioctl_defrag_range_args { */ #define BTRFS_DEV_STATS_KEY 249 +/* + * Persistently stores the device replace state in the device tree. + * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0). + */ +#define BTRFS_DEV_REPLACE_KEY 250 + /* * string items are for debugging.
They just store a short string of * data in the FS @@ -2795,6 +2818,49 @@ BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item, BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, rsv_excl, 64); +/* btrfs_dev_replace_item */ +BTRFS_SETGET_FUNCS(dev_replace_src_devid, + struct btrfs_dev_replace_item, src_devid, 64); +BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode, + struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode, + 64); +BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item, + replace_state, 64); +BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item, + time_started, 64); +BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item, + time_stopped, 64); +BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item, + num_write_errors, 64); +BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors, + struct btrfs_dev_replace_item, num_uncorrectable_read_errors, + 64); +BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item, + cursor_left, 64); +BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item, + cursor_right, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid, + struct btrfs_dev_replace_item, src_devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode, + struct btrfs_dev_replace_item, + cont_reading_from_srcdev_mode, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state, + struct btrfs_dev_replace_item, replace_state, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started, + struct btrfs_dev_replace_item, time_started, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped, + struct btrfs_dev_replace_item, time_stopped, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors, + struct btrfs_dev_replace_item, num_write_errors, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors, + struct btrfs_dev_replace_item, + num_uncorrectable_read_errors, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left, + struct btrfs_dev_replace_item, cursor_left, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, + struct btrfs_dev_replace_item, cursor_right, 64); + static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) { return sb->s_fs_info; diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 5e23684887eb..50d95fd190a5 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -297,6 +297,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) case BTRFS_DEV_STATS_KEY: printk(KERN_INFO "\t\tdevice stats\n"); break; + case BTRFS_DEV_REPLACE_KEY: + printk(KERN_INFO "\t\tdev replace\n"); + break; }; } } -- cgit v1.2.1 From 5ac00addc7ac09110995fe967071d191b5981cc1 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 17:54:08 +0100 Subject: Btrfs: disallow mutually exclusive admin operations from user mode Btrfs admin operations that are manually started from user mode and that cannot be executed at the same time return -EINPROGRESS. A common way to enter and leave this locked section is introduced since it used to be specific to the balance operation. 
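The enter/leave pattern that this change repeats for the resize, defrag, device add/delete and balance ioctls can be summarized as follows. This is a sketch, not from the patch; example_exclusive_op is a made-up name, while the atomic field and the message are the ones introduced in the hunks below.

/*
 * Sketch of the common locked section: atomic_xchg() returns the old
 * value and sets the flag in one atomic step, so exactly one caller
 * enters while all others bail out with -EINPROGRESS.
 */
static int example_exclusive_op(struct btrfs_fs_info *fs_info)
{
	if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
		return -EINPROGRESS;	/* another operation holds the flag */
	}

	/* ... perform the mutually exclusive admin operation ... */

	atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
	return 0;
}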
Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/ioctl.c | 53 ++++++++++++++++++++++++++++++++++++----------------- fs/btrfs/volumes.c | 2 ++ 3 files changed, 40 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 147406d0f9a9..e9dc78014f09 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1527,6 +1527,8 @@ struct btrfs_fs_info { /* device replace state */ struct btrfs_dev_replace dev_replace; + + atomic_t mutually_exclusive_operation_running; }; /* diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b40b827f93e7..26f46dad3b0e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1317,13 +1317,13 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - mutex_lock(&root->fs_info->volume_mutex); - if (root->fs_info->balance_ctl) { - printk(KERN_INFO "btrfs: balance in progress\n"); - ret = -EINVAL; - goto out; + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, + 1)) { + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + return -EINPROGRESS; } + mutex_lock(&root->fs_info->volume_mutex); vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); @@ -1419,6 +1419,7 @@ out_free: kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); return ret; } @@ -2160,9 +2161,17 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) if (btrfs_root_readonly(root)) return -EROFS; + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, + 1)) { + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + return -EINPROGRESS; + } ret = mnt_want_write_file(file); - if (ret) + if (ret) { + atomic_set(&root->fs_info->mutually_exclusive_operation_running, + 0); return ret; + } switch (inode->i_mode & S_IFMT) { case S_IFDIR: @@ -2214,6 +2223,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) } out: mnt_drop_write_file(file); + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); return ret; } @@ -2225,13 +2235,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - mutex_lock(&root->fs_info->volume_mutex); - if (root->fs_info->balance_ctl) { - printk(KERN_INFO "btrfs: balance in progress\n"); - ret = -EINVAL; - goto out; + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, + 1)) { + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + return -EINPROGRESS; } + mutex_lock(&root->fs_info->volume_mutex); vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); @@ -2244,6 +2254,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); return ret; } @@ -2258,13 +2269,13 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; - mutex_lock(&root->fs_info->volume_mutex); - if (root->fs_info->balance_ctl) { - printk(KERN_INFO "btrfs: balance in progress\n"); - ret = -EINVAL; - goto out; + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, + 1)) { + pr_info("btrfs: dev 
add/delete/balance/replace/resize operation in progress\n"); + return -EINPROGRESS; } + mutex_lock(&root->fs_info->volume_mutex); vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); @@ -2277,6 +2288,7 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); return ret; } @@ -3319,6 +3331,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) struct btrfs_ioctl_balance_args *bargs; struct btrfs_balance_control *bctl; int ret; + int need_to_clear_lock = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -3354,10 +3367,13 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) bargs = NULL; } - if (fs_info->balance_ctl) { + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, + 1)) { + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); ret = -EINPROGRESS; goto out_bargs; } + need_to_clear_lock = 1; bctl = kzalloc(sizeof(*bctl), GFP_NOFS); if (!bctl) { @@ -3391,6 +3407,9 @@ do_balance: out_bargs: kfree(bargs); out: + if (need_to_clear_lock) + atomic_set(&root->fs_info->mutually_exclusive_operation_running, + 0); mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); mnt_drop_write_file(file); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d2c0bccca607..33ca36b37a6a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2952,6 +2952,7 @@ static int balance_kthread(void *data) ret = btrfs_balance(fs_info->balance_ctl, NULL); } + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); @@ -2974,6 +2975,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) return 0; } + WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); if (IS_ERR(tsk)) return PTR_ERR(tsk); -- cgit v1.2.1 From 63a212abc2315972b245f93cb11ae3acf3c0b513 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 18:29:28 +0100 Subject: Btrfs: disallow some operations on the device replace target device This patch adds some code to disallow operations on the device that is used as the target for the device replace operation. 
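The tests that this patch scatters over the allocator, the ioctls and the mount path all follow one pattern. The following is a sketch; device_usable_sketch is a made-up helper built around the is_tgtdev_for_dev_replace flag that the patch adds to struct btrfs_device.

/*
 * Sketch of the recurring check: a device is skipped when it is not
 * in the FS metadata, has no block device attached, or serves as the
 * temporary target of a running device replace operation.
 */
static bool device_usable_sketch(struct btrfs_device *device)
{
	return device->in_fs_metadata &&
	       device->bdev &&
	       !device->is_tgtdev_for_dev_replace;
}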
Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 +- fs/btrfs/extent-tree.c | 3 ++- fs/btrfs/ioctl.c | 8 +++++++- fs/btrfs/scrub.c | 14 +++++++++----- fs/btrfs/super.c | 3 ++- fs/btrfs/volumes.c | 41 ++++++++++++++++++++++++++++++++--------- fs/btrfs/volumes.h | 1 + 7 files changed, 54 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e9dc78014f09..746cb6aa1f62 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3649,7 +3649,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, /* scrub.c */ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, - int readonly); + int readonly, int is_dev_replace); void btrfs_scrub_pause(struct btrfs_root *root); void btrfs_scrub_pause_super(struct btrfs_root *root); void btrfs_scrub_continue(struct btrfs_root *root); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b4d438f6c2b3..98af8379895a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7468,7 +7468,8 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) * check to make sure we can actually find a chunk with enough * space to fit our block group in. */ - if (device->total_bytes > device->bytes_used + min_free) { + if (device->total_bytes > device->bytes_used + min_free && + !device->is_tgtdev_for_dev_replace) { ret = find_free_dev_extent(device, min_free, &dev_offset, NULL); if (!ret) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 26f46dad3b0e..e54b5e50c927 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1375,6 +1375,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, } } + if (device->is_tgtdev_for_dev_replace) { + ret = -EINVAL; + goto out_free; + } + old_size = device->total_bytes; if (mod < 0) { @@ -3102,7 +3107,8 @@ static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) return PTR_ERR(sa); ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end, - &sa->progress, sa->flags & BTRFS_SCRUB_READONLY); + &sa->progress, sa->flags & BTRFS_SCRUB_READONLY, + 0); if (copy_to_user(arg, sa, sizeof(*sa))) ret = -EFAULT; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6cf23f4f7bb7..460e30bb1884 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -116,6 +116,9 @@ struct scrub_ctx { u32 sectorsize; u32 nodesize; u32 leafsize; + + int is_dev_replace; + /* * statistics */ @@ -284,7 +287,7 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) } static noinline_for_stack -struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev) +struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) { struct scrub_ctx *sctx; int i; @@ -296,6 +299,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev) sctx = kzalloc(sizeof(*sctx), GFP_NOFS); if (!sctx) goto nomem; + sctx->is_dev_replace = is_dev_replace; sctx->pages_per_bio = pages_per_bio; sctx->curr = -1; sctx->dev_root = dev->dev_root; @@ -2293,7 +2297,7 @@ static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, - int readonly) + int readonly, int is_dev_replace) { struct scrub_ctx *sctx; int ret; @@ -2356,14 +2360,14 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_lock(&fs_info->fs_devices->device_list_mutex); dev = btrfs_find_device(fs_info, devid, NULL, NULL); 
- if (!dev || dev->missing) { + if (!dev || (dev->missing && !is_dev_replace)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); scrub_workers_put(fs_info); return -ENODEV; } mutex_lock(&fs_info->scrub_lock); - if (!dev->in_fs_metadata) { + if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) { mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); scrub_workers_put(fs_info); @@ -2376,7 +2380,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, scrub_workers_put(fs_info); return -EINPROGRESS; } - sctx = scrub_setup_ctx(dev); + sctx = scrub_setup_ctx(dev, is_dev_replace); if (IS_ERR(sctx)) { mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ef2415896b06..837ad2d27853 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1354,7 +1354,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) min_stripe_size = BTRFS_STRIPE_LEN; list_for_each_entry(device, &fs_devices->devices, dev_list) { - if (!device->in_fs_metadata || !device->bdev) + if (!device->in_fs_metadata || !device->bdev || + device->is_tgtdev_for_dev_replace) continue; avail_space = device->total_bytes - device->bytes_used; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 33ca36b37a6a..31f7af878d96 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -518,8 +518,9 @@ again: /* This is the initialized path, it is safe to release the devices. */ list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { if (device->in_fs_metadata) { - if (!latest_transid || - device->generation > latest_transid) { + if (!device->is_tgtdev_for_dev_replace && + (!latest_transid || + device->generation > latest_transid)) { latest_devid = device->devid; latest_transid = device->generation; latest_bdev = device->bdev; @@ -814,7 +815,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, *length = 0; - if (start >= device->total_bytes) + if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace) return 0; path = btrfs_alloc_path(); @@ -931,7 +932,7 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, max_hole_size = 0; hole_size = 0; - if (search_start >= search_end) { + if (search_start >= search_end || device->is_tgtdev_for_dev_replace) { ret = -ENOSPC; goto error; } @@ -1114,6 +1115,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_key key; WARN_ON(!device->in_fs_metadata); + WARN_ON(device->is_tgtdev_for_dev_replace); path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -1375,7 +1377,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) * is held. 
*/ list_for_each_entry(tmp, devices, dev_list) { - if (tmp->in_fs_metadata && !tmp->bdev) { + if (tmp->in_fs_metadata && + !tmp->is_tgtdev_for_dev_replace && + !tmp->bdev) { device = tmp; break; } @@ -1406,6 +1410,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) } } + if (device->is_tgtdev_for_dev_replace) { + pr_err("btrfs: unable to remove the dev_replace target dev\n"); + ret = -EINVAL; + goto error_brelse; + } + if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { printk(KERN_ERR "btrfs: unable to remove the only writeable " "device\n"); @@ -1425,6 +1435,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (ret) goto error_undo; + /* + * TODO: the superblock still includes this device in its num_devices + * counter although write_all_supers() is not locked out. This + * could give a filesystem state which requires a degraded mount. + */ ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); if (ret) goto error_undo; @@ -1808,6 +1823,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) device->dev_root = root->fs_info->dev_root; device->bdev = bdev; device->in_fs_metadata = 1; + device->is_tgtdev_for_dev_replace = 0; device->mode = FMODE_EXCL; set_blocksize(device->bdev, 4096); @@ -1971,7 +1987,8 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans, if (!device->writeable) return -EACCES; - if (new_size <= device->total_bytes) + if (new_size <= device->total_bytes || + device->is_tgtdev_for_dev_replace) return -EINVAL; btrfs_set_super_total_bytes(super_copy, old_total + diff); @@ -2600,7 +2617,8 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) size_to_free = div_factor(old_size, 1); size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); if (!device->writeable || - device->total_bytes - device->bytes_used > size_to_free) + device->total_bytes - device->bytes_used > size_to_free || + device->is_tgtdev_for_dev_replace) continue; ret = btrfs_shrink_device(device, old_size - size_to_free); @@ -3132,6 +3150,9 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) u64 old_size = device->total_bytes; u64 diff = device->total_bytes - new_size; + if (device->is_tgtdev_for_dev_replace) + return -EINVAL; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -3401,7 +3422,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, continue; } - if (!device->in_fs_metadata) + if (!device->in_fs_metadata || + device->is_tgtdev_for_dev_replace) continue; if (device->total_bytes > device->bytes_used) @@ -4612,6 +4634,7 @@ static void fill_device_from_item(struct extent_buffer *leaf, device->io_align = btrfs_device_io_align(leaf, dev_item); device->io_width = btrfs_device_io_width(leaf, dev_item); device->sector_size = btrfs_device_sector_size(leaf, dev_item); + device->is_tgtdev_for_dev_replace = 0; ptr = (unsigned long)btrfs_device_uuid(dev_item); read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); @@ -4722,7 +4745,7 @@ static int read_one_dev(struct btrfs_root *root, fill_device_from_item(leaf, dev_item, device); device->dev_root = root->fs_info->dev_root; device->in_fs_metadata = 1; - if (device->writeable) { + if (device->writeable && !device->is_tgtdev_for_dev_replace) { device->fs_devices->total_rw_bytes += device->total_bytes; spin_lock(&root->fs_info->free_chunk_lock); root->fs_info->free_chunk_space += device->total_bytes - diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 802e2ba02f09..8fd5a4d8acc8 100644 --- a/fs/btrfs/volumes.h +++ 
b/fs/btrfs/volumes.h @@ -50,6 +50,7 @@ struct btrfs_device { int in_fs_metadata; int missing; int can_discard; + int is_tgtdev_for_dev_replace; spinlock_t io_lock; -- cgit v1.2.1 From 618919236ba54361e93106f4951d233a7ade63cd Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 18:51:52 +0100 Subject: Btrfs: handle errors from btrfs_map_bio() everywhere With the addition of the device replace procedure, it is possible for btrfs_map_bio(READ) to report an error. This happens when the specific mirror that is located on the target disk is requested, and the copy operation has not yet copied this block. Hence the block cannot be read, and this error state is indicated by returning EIO. Some background information follows. A new mirror is added while the device replace procedure is running. btrfs_get_num_copies() returns one more, and btrfs_map_bio(GET_READ_MIRROR) adds one more mirror if a disk location is involved that was already handled by the device replace copy operation. The assigned mirror num is the highest mirror number, e.g. the value 3 in case of RAID1. If btrfs_map_bio() is invoked with mirror_num == 0 (i.e., select any mirror), the copy on the target drive is never selected, because that disk shall be able to perform the write requests as quickly as possible. The parallel execution of read requests would only slow down the disk copy procedure. The second case is that btrfs_map_bio() is called with mirror_num > 0. This is done from the repair code only. In this case, the highest mirror num is assigned to the target disk, since it is used last. When this mirror is not available because the copy procedure has not yet handled this area, an error is returned. Handling of such errors is now added everywhere in the code. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/check-integrity.c | 15 +++++++++++++-- fs/btrfs/compression.c | 6 ++++-- fs/btrfs/disk-io.c | 44 +++++++++++++++++++++++++++----------------- fs/btrfs/extent_io.c | 4 ---- fs/btrfs/inode.c | 27 ++++++++++++++++++++------- fs/btrfs/volumes.c | 2 +- 6 files changed, 65 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 8f9abedae2c3..badc6f141b6f 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1585,6 +1585,18 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, ret = btrfs_map_block(state->root->fs_info, READ, bytenr, &length, &multi, mirror_num); + if (ret) { + block_ctx_out->start = 0; + block_ctx_out->dev_bytenr = 0; + block_ctx_out->len = 0; + block_ctx_out->dev = NULL; + block_ctx_out->datav = NULL; + block_ctx_out->pagev = NULL; + block_ctx_out->mem_to_free = NULL; + + return ret; + } + device = multi->stripes[0].dev; block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev); block_ctx_out->dev_bytenr = multi->stripes[0].physical; @@ -1594,8 +1606,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, block_ctx_out->pagev = NULL; block_ctx_out->mem_to_free = NULL; - if (0 == ret) - kfree(multi); + kfree(multi); if (NULL == block_ctx_out->dev) { ret = -ENXIO; printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n"); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c6467aa88bee..94ab2f80e7e3 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -687,7 +687,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(root, READ, comp_bio, mirror_num,
0); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + bio_endio(comp_bio, ret); bio_put(comp_bio); @@ -712,7 +713,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, } ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + bio_endio(comp_bio, ret); bio_put(comp_bio); return 0; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9d1b71060813..0e410478ad27 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -852,11 +852,16 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { + int ret; + /* * when we're called for a write, we're already in the async * submission context. Just jump into btrfs_map_bio */ - return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); + ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); + if (ret) + bio_endio(bio, ret); + return ret; } static int check_async_write(struct inode *inode, unsigned long bio_flags) @@ -878,7 +883,6 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, int ret; if (!(rw & REQ_WRITE)) { - /* * called for a read, do the setup so that checksum validation * can happen in the async kernel threads @@ -886,26 +890,32 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, bio, 1); if (ret) - return ret; - return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, - mirror_num, 0); + goto out_w_error; + ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, + mirror_num, 0); } else if (!async) { ret = btree_csum_one_bio(bio); if (ret) - return ret; - return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, - mirror_num, 0); + goto out_w_error; + ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, + mirror_num, 0); + } else { + /* + * kthread helpers are used to submit writes so that + * checksumming can happen in parallel across all CPUs + */ + ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + inode, rw, bio, mirror_num, 0, + bio_offset, + __btree_submit_bio_start, + __btree_submit_bio_done); } - /* - * kthread helpers are used to submit writes so that checksumming - * can happen in parallel across all CPUs - */ - return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, - inode, rw, bio, mirror_num, 0, - bio_offset, - __btree_submit_bio_start, - __btree_submit_bio_done); + if (ret) { +out_w_error: + bio_endio(bio, ret); + } + return ret; } #ifdef CONFIG_MIGRATION diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 62ec6e45f705..1b319df29eee 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2462,10 +2462,6 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, return bio; } -/* - * Since writes are async, they will only return -ENOMEM. - * Reads can return the full range of I/O error conditions. 
- */ static int __must_check submit_one_bio(int rw, struct bio *bio, int mirror_num, unsigned long bio_flags) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5d1675a8c9e2..d7bf2e7ee8a0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1602,7 +1602,12 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; - return btrfs_map_bio(root, rw, bio, mirror_num, 1); + int ret; + + ret = btrfs_map_bio(root, rw, bio, mirror_num, 1); + if (ret) + bio_endio(bio, ret); + return ret; } /* @@ -1626,15 +1631,17 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, if (!(rw & REQ_WRITE)) { ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); if (ret) - return ret; + goto out; if (bio_flags & EXTENT_BIO_COMPRESSED) { - return btrfs_submit_compressed_read(inode, bio, - mirror_num, bio_flags); + ret = btrfs_submit_compressed_read(inode, bio, + mirror_num, + bio_flags); + goto out; } else if (!skip_sum) { ret = btrfs_lookup_bio_sums(root, inode, bio, NULL); if (ret) - return ret; + goto out; } goto mapit; } else if (!skip_sum) { @@ -1642,15 +1649,21 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) goto mapit; /* we're doing a write, do the async checksumming */ - return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, inode, rw, bio, mirror_num, bio_flags, bio_offset, __btrfs_submit_bio_start, __btrfs_submit_bio_done); + goto out; } mapit: - return btrfs_map_bio(root, rw, bio, mirror_num, 0); + ret = btrfs_map_bio(root, rw, bio, mirror_num, 0); + +out: + if (ret < 0) + bio_endio(bio, ret); + return ret; } /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 31f7af878d96..415862885b67 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4435,7 +4435,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, mirror_num); - if (ret) /* -ENOMEM */ + if (ret) return ret; total_devs = bbio->num_stripes; -- cgit v1.2.1 From ff023aac31198e88507d626825379b28ea481d4d Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 6 Nov 2012 11:43:11 +0100 Subject: Btrfs: add code to scrub to copy read data to another disk The device replace procedure makes use of the scrub code. The scrub code is the most efficient code to read the allocated data of a disk, i.e. it reads sequentially in order to avoid disk head movements, it skips unallocated blocks, it uses read ahead mechanisms, and it contains all the code to detect and repair defects. This commit adds code to scrub to allow the scrub code to copy read data to another disk. One goal is to perform as fast as possible. Therefore the write requests are collected until huge bios are built, and the write process is decoupled from the read process with some flow control in order to limit the allocated memory. The best performance on spinning disks can be reached when the head movements are avoided as much as possible. Therefore a single worker is used to interface the read process with the write process. The regular scrub operation works as fast as before; it is not negatively influenced and is actually more or less unchanged.
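The write side described above can be pictured as follows. This sketch only mirrors the shape of the scrub_add_page_to_wr_bio()/scrub_wr_submit() pair added below; sketch_queue_page_for_write is a made-up name, and the elided step is marked as a comment.

/*
 * Sketch: pages destined for the target disk are collected under
 * wr_lock until a bio of pages_per_wr_bio pages is full, then the
 * huge bio is submitted in one go.
 */
static void sketch_queue_page_for_write(struct scrub_ctx *sctx,
					struct scrub_page *spage)
{
	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;

	mutex_lock(&wr_ctx->wr_lock);
	/* (elided) open wr_ctx->wr_curr_bio if none is being filled,
	 * then append spage to it */
	if (wr_ctx->wr_curr_bio->page_count == wr_ctx->pages_per_wr_bio)
		scrub_wr_submit(sctx);	/* full: hand off the huge bio */
	mutex_unlock(&wr_ctx->wr_lock);
}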
Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 + fs/btrfs/dev-replace.h | 26 ++ fs/btrfs/reada.c | 10 +- fs/btrfs/scrub.c | 883 +++++++++++++++++++++++++++++++++++++++++++++---- fs/btrfs/super.c | 3 +- 5 files changed, 851 insertions(+), 73 deletions(-) create mode 100644 fs/btrfs/dev-replace.h (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 746cb6aa1f62..ded7caa0d304 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1483,6 +1483,8 @@ struct btrfs_fs_info { struct rw_semaphore scrub_super_lock; int scrub_workers_refcnt; struct btrfs_workers scrub_workers; + struct btrfs_workers scrub_wr_completion_workers; + struct btrfs_workers scrub_nocow_workers; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY u32 check_integrity_print_mask; diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h new file mode 100644 index 000000000000..1fb5c89037ee --- /dev/null +++ b/fs/btrfs/dev-replace.h @@ -0,0 +1,26 @@ +/* + * Copyright (C) STRATO AG 2012. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#if !defined(__BTRFS_DEV_REPLACE__) +#define __BTRFS_DEV_REPLACE__ + +static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value) +{ + atomic64_inc(stat_value); +} +#endif diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 0ddc5659f946..9f363e17ec74 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -418,12 +418,17 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, */ continue; } + if (!dev->bdev) { + /* cannot read ahead on missing device */ + continue; + } prev_dev = dev; ret = radix_tree_insert(&dev->reada_extents, index, re); if (ret) { while (--i >= 0) { dev = bbio->stripes[i].dev; BUG_ON(dev == NULL); + /* ignore whether the entry was inserted */ radix_tree_delete(&dev->reada_extents, index); } BUG_ON(fs_info == NULL); @@ -914,7 +919,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, generation = btrfs_header_generation(node); free_extent_buffer(node); - reada_add_block(rc, start, &max_key, level, generation); + if (reada_add_block(rc, start, &max_key, level, generation)) { + kfree(rc); + return ERR_PTR(-ENOMEM); + } reada_start_machine(root->fs_info); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 460e30bb1884..61157a26cf2a 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -25,6 +25,7 @@ #include "transaction.h" #include "backref.h" #include "extent_io.h" +#include "dev-replace.h" #include "check-integrity.h" #include "rcu-string.h" @@ -44,8 +45,15 @@ struct scrub_block; struct scrub_ctx; -#define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ -#define SCRUB_BIOS_PER_CTX 16 /* 1 MB per device in flight */ +/* + * the following three values only influence the performance. + * The last one configures the number of parallel and outstanding I/O + * operations. 
The first two values configure an upper limit for the number + * of (dynamically allocated) pages that are added to a bio. + */ +#define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */ +#define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */ +#define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */ /* * the following value times PAGE_SIZE needs to be large enough to match the @@ -62,6 +70,7 @@ struct scrub_page { u64 generation; u64 logical; u64 physical; + u64 physical_for_dev_replace; atomic_t ref_count; struct { unsigned int mirror_num:8; @@ -79,7 +88,11 @@ struct scrub_bio { int err; u64 logical; u64 physical; - struct scrub_page *pagev[SCRUB_PAGES_PER_BIO]; +#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO + struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO]; +#else + struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO]; +#endif int page_count; int next_free; struct btrfs_work work; @@ -99,8 +112,16 @@ struct scrub_block { }; }; +struct scrub_wr_ctx { + struct scrub_bio *wr_curr_bio; + struct btrfs_device *tgtdev; + int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */ + atomic_t flush_all_writes; + struct mutex wr_lock; +}; + struct scrub_ctx { - struct scrub_bio *bios[SCRUB_BIOS_PER_CTX]; + struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX]; struct btrfs_root *dev_root; int first_free; int curr; @@ -112,12 +133,13 @@ struct scrub_ctx { struct list_head csum_list; atomic_t cancel_req; int readonly; - int pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */ + int pages_per_rd_bio; u32 sectorsize; u32 nodesize; u32 leafsize; int is_dev_replace; + struct scrub_wr_ctx wr_ctx; /* * statistics @@ -135,6 +157,15 @@ struct scrub_fixup_nodatasum { int mirror_num; }; +struct scrub_copy_nocow_ctx { + struct scrub_ctx *sctx; + u64 logical; + u64 len; + int mirror_num; + u64 physical_for_dev_replace; + struct btrfs_work work; +}; + struct scrub_warning { struct btrfs_path *path; u64 extent_item_size; @@ -156,8 +187,9 @@ static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx); static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); static int scrub_setup_recheck_block(struct scrub_ctx *sctx, struct btrfs_fs_info *fs_info, + struct scrub_block *original_sblock, u64 length, u64 logical, - struct scrub_block *sblock); + struct scrub_block *sblocks_for_recheck); static void scrub_recheck_block(struct btrfs_fs_info *fs_info, struct scrub_block *sblock, int is_metadata, int have_csum, u8 *csum, u64 generation, @@ -174,6 +206,9 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good, int page_num, int force_write); +static void scrub_write_block_to_dev_replace(struct scrub_block *sblock); +static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, + int page_num); static int scrub_checksum_data(struct scrub_block *sblock); static int scrub_checksum_tree_block(struct scrub_block *sblock); static int scrub_checksum_super(struct scrub_block *sblock); @@ -181,14 +216,38 @@ static void scrub_block_get(struct scrub_block *sblock); static void scrub_block_put(struct scrub_block *sblock); static void scrub_page_get(struct scrub_page *spage); static void scrub_page_put(struct scrub_page *spage); -static int scrub_add_page_to_bio(struct scrub_ctx *sctx, - struct scrub_page *spage); +static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, + struct scrub_page *spage); static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, u64 physical, struct 
btrfs_device *dev, u64 flags, - u64 gen, int mirror_num, u8 *csum, int force); + u64 gen, int mirror_num, u8 *csum, int force, + u64 physical_for_dev_replace); static void scrub_bio_end_io(struct bio *bio, int err); static void scrub_bio_end_io_worker(struct btrfs_work *work); static void scrub_block_complete(struct scrub_block *sblock); +static void scrub_remap_extent(struct btrfs_fs_info *fs_info, + u64 extent_logical, u64 extent_len, + u64 *extent_physical, + struct btrfs_device **extent_dev, + int *extent_mirror_num); +static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, + struct scrub_wr_ctx *wr_ctx, + struct btrfs_fs_info *fs_info, + struct btrfs_device *dev, + int is_dev_replace); +static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx); +static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, + struct scrub_page *spage); +static void scrub_wr_submit(struct scrub_ctx *sctx); +static void scrub_wr_bio_end_io(struct bio *bio, int err); +static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); +static int write_page_nocow(struct scrub_ctx *sctx, + u64 physical_for_dev_replace, struct page *page); +static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, + void *ctx); +static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, + int mirror_num, u64 physical_for_dev_replace); +static void copy_nocow_pages_worker(struct btrfs_work *work); static void scrub_pending_bio_inc(struct scrub_ctx *sctx) @@ -262,19 +321,20 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) if (!sctx) return; + scrub_free_wr_ctx(&sctx->wr_ctx); + /* this can happen when scrub is cancelled */ if (sctx->curr != -1) { struct scrub_bio *sbio = sctx->bios[sctx->curr]; for (i = 0; i < sbio->page_count; i++) { - BUG_ON(!sbio->pagev[i]); - BUG_ON(!sbio->pagev[i]->page); + WARN_ON(!sbio->pagev[i]->page); scrub_block_put(sbio->pagev[i]->sblock); } bio_put(sbio->bio); } - for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) { + for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { struct scrub_bio *sbio = sctx->bios[i]; if (!sbio) @@ -292,18 +352,29 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) struct scrub_ctx *sctx; int i; struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; - int pages_per_bio; + int pages_per_rd_bio; + int ret; - pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO, - bio_get_nr_vecs(dev->bdev)); + /* + * the setting of pages_per_rd_bio is correct for scrub but might + * be wrong for the dev_replace code where we might read from + * different devices in the initial huge bios. However, that + * code is able to correctly handle the case when adding a page + * to a bio fails. 
+ */ + if (dev->bdev) + pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO, + bio_get_nr_vecs(dev->bdev)); + else + pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; sctx = kzalloc(sizeof(*sctx), GFP_NOFS); if (!sctx) goto nomem; sctx->is_dev_replace = is_dev_replace; - sctx->pages_per_bio = pages_per_bio; + sctx->pages_per_rd_bio = pages_per_rd_bio; sctx->curr = -1; sctx->dev_root = dev->dev_root; - for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) { + for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { struct scrub_bio *sbio; sbio = kzalloc(sizeof(*sbio), GFP_NOFS); @@ -316,7 +387,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) sbio->page_count = 0; sbio->work.func = scrub_bio_end_io_worker; - if (i != SCRUB_BIOS_PER_CTX - 1) + if (i != SCRUB_BIOS_PER_SCTX - 1) sctx->bios[i]->next_free = i + 1; else sctx->bios[i]->next_free = -1; @@ -334,6 +405,13 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) spin_lock_init(&sctx->list_lock); spin_lock_init(&sctx->stat_lock); init_waitqueue_head(&sctx->list_wait); + + ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info, + fs_info->dev_replace.tgtdev, is_dev_replace); + if (ret) { + scrub_free_ctx(sctx); + return ERR_PTR(ret); + } return sctx; nomem: @@ -341,7 +419,8 @@ nomem: return ERR_PTR(-ENOMEM); } -static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) +static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, + void *warn_ctx) { u64 isize; u32 nlink; @@ -349,7 +428,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) int i; struct extent_buffer *eb; struct btrfs_inode_item *inode_item; - struct scrub_warning *swarn = ctx; + struct scrub_warning *swarn = warn_ctx; struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info; struct inode_fs_paths *ipath = NULL; struct btrfs_root *local_root; @@ -492,11 +571,11 @@ out: kfree(swarn.msg_buf); } -static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) +static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) { struct page *page = NULL; unsigned long index; - struct scrub_fixup_nodatasum *fixup = ctx; + struct scrub_fixup_nodatasum *fixup = fixup_ctx; int ret; int corrected = 0; struct btrfs_key key; @@ -660,7 +739,9 @@ out: spin_lock(&sctx->stat_lock); ++sctx->stat.uncorrectable_errors; spin_unlock(&sctx->stat_lock); - + btrfs_dev_replace_stats_inc( + &sctx->dev_root->fs_info->dev_replace. + num_uncorrectable_read_errors); printk_ratelimited_in_rcu(KERN_ERR "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", (unsigned long long)fixup->logical, @@ -715,6 +796,11 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) csum = sblock_to_check->pagev[0]->csum; dev = sblock_to_check->pagev[0]->dev; + if (sctx->is_dev_replace && !is_metadata && !have_csum) { + sblocks_for_recheck = NULL; + goto nodatasum_case; + } + /* * read all mirrors one after the other. 
This includes to * re-read the extent or metadata block that failed (that was @@ -758,7 +844,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) } /* setup the context, map the logical blocks and alloc the pages */ - ret = scrub_setup_recheck_block(sctx, fs_info, length, + ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length, logical, sblocks_for_recheck); if (ret) { spin_lock(&sctx->stat_lock); @@ -789,6 +875,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sctx->stat.unverified_errors++; spin_unlock(&sctx->stat_lock); + if (sctx->is_dev_replace) + scrub_write_block_to_dev_replace(sblock_bad); goto out; } @@ -822,12 +910,15 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) BTRFS_DEV_STAT_CORRUPTION_ERRS); } - if (sctx->readonly) + if (sctx->readonly && !sctx->is_dev_replace) goto did_not_correct_error; if (!is_metadata && !have_csum) { struct scrub_fixup_nodatasum *fixup_nodatasum; +nodatasum_case: + WARN_ON(sctx->is_dev_replace); + /* * !is_metadata and !have_csum, this means that the data * might not be COW'ed, that it might be modified @@ -883,18 +974,79 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) if (!sblock_other->header_error && !sblock_other->checksum_error && sblock_other->no_io_error_seen) { - int force_write = is_metadata || have_csum; - - ret = scrub_repair_block_from_good_copy(sblock_bad, - sblock_other, - force_write); + if (sctx->is_dev_replace) { + scrub_write_block_to_dev_replace(sblock_other); + } else { + int force_write = is_metadata || have_csum; + + ret = scrub_repair_block_from_good_copy( + sblock_bad, sblock_other, + force_write); + } if (0 == ret) goto corrected_error; } } /* - * in case of I/O errors in the area that is supposed to be + * for dev_replace, pick good pages and write to the target device. + */ + if (sctx->is_dev_replace) { + success = 1; + for (page_num = 0; page_num < sblock_bad->page_count; + page_num++) { + int sub_success; + + sub_success = 0; + for (mirror_index = 0; + mirror_index < BTRFS_MAX_MIRRORS && + sblocks_for_recheck[mirror_index].page_count > 0; + mirror_index++) { + struct scrub_block *sblock_other = + sblocks_for_recheck + mirror_index; + struct scrub_page *page_other = + sblock_other->pagev[page_num]; + + if (!page_other->io_error) { + ret = scrub_write_page_to_dev_replace( + sblock_other, page_num); + if (ret == 0) { + /* succeeded for this page */ + sub_success = 1; + break; + } else { + btrfs_dev_replace_stats_inc( + &sctx->dev_root-> + fs_info->dev_replace. + num_write_errors); + } + } + } + + if (!sub_success) { + /* + * did not find a mirror to fetch the page + * from. scrub_write_page_to_dev_replace() + * handles this case (page->io_error), by + * filling the block with zeros before + * submitting the write request + */ + success = 0; + ret = scrub_write_page_to_dev_replace( + sblock_bad, page_num); + if (ret) + btrfs_dev_replace_stats_inc( + &sctx->dev_root->fs_info-> + dev_replace.num_write_errors); + } + } + + goto out; + } + + /* + * for regular scrub, repair those pages that are errored. + * In case of I/O errors in the area that is supposed to be * repaired, continue by picking good copies of those pages. * Select the good pages from mirrors to rewrite bad pages from * the area to fix. 
Afterwards verify the checksum of the block @@ -1017,6 +1169,7 @@ out: static int scrub_setup_recheck_block(struct scrub_ctx *sctx, struct btrfs_fs_info *fs_info, + struct scrub_block *original_sblock, u64 length, u64 logical, struct scrub_block *sblocks_for_recheck) { @@ -1047,7 +1200,7 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx, return -EIO; } - BUG_ON(page_index >= SCRUB_PAGES_PER_BIO); + BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; mirror_index++) { struct scrub_block *sblock; @@ -1071,6 +1224,10 @@ leave_nomem: sblock->pagev[page_index] = page; page->logical = logical; page->physical = bbio->stripes[mirror_index].physical; + BUG_ON(page_index >= original_sblock->page_count); + page->physical_for_dev_replace = + original_sblock->pagev[page_index]-> + physical_for_dev_replace; /* for missing devices, dev->bdev is NULL */ page->dev = bbio->stripes[mirror_index].dev; page->mirror_num = mirror_index + 1; @@ -1249,6 +1406,12 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, int ret; DECLARE_COMPLETION_ONSTACK(complete); + if (!page_bad->dev->bdev) { + printk_ratelimited(KERN_WARNING + "btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n"); + return -EIO; + } + bio = bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; @@ -1269,6 +1432,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, if (!bio_flagged(bio, BIO_UPTODATE)) { btrfs_dev_stat_inc_and_print(page_bad->dev, BTRFS_DEV_STAT_WRITE_ERRS); + btrfs_dev_replace_stats_inc( + &sblock_bad->sctx->dev_root->fs_info-> + dev_replace.num_write_errors); bio_put(bio); return -EIO; } @@ -1278,7 +1444,168 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, return 0; } -static void scrub_checksum(struct scrub_block *sblock) +static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) +{ + int page_num; + + for (page_num = 0; page_num < sblock->page_count; page_num++) { + int ret; + + ret = scrub_write_page_to_dev_replace(sblock, page_num); + if (ret) + btrfs_dev_replace_stats_inc( + &sblock->sctx->dev_root->fs_info->dev_replace. 
+ num_write_errors); + } +} + +static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, + int page_num) +{ + struct scrub_page *spage = sblock->pagev[page_num]; + + BUG_ON(spage->page == NULL); + if (spage->io_error) { + void *mapped_buffer = kmap_atomic(spage->page); + + memset(mapped_buffer, 0, PAGE_CACHE_SIZE); + flush_dcache_page(spage->page); + kunmap_atomic(mapped_buffer); + } + return scrub_add_page_to_wr_bio(sblock->sctx, spage); +} + +static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, + struct scrub_page *spage) +{ + struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx; + struct scrub_bio *sbio; + int ret; + + mutex_lock(&wr_ctx->wr_lock); +again: + if (!wr_ctx->wr_curr_bio) { + wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio), + GFP_NOFS); + if (!wr_ctx->wr_curr_bio) { + mutex_unlock(&wr_ctx->wr_lock); + return -ENOMEM; + } + wr_ctx->wr_curr_bio->sctx = sctx; + wr_ctx->wr_curr_bio->page_count = 0; + } + sbio = wr_ctx->wr_curr_bio; + if (sbio->page_count == 0) { + struct bio *bio; + + sbio->physical = spage->physical_for_dev_replace; + sbio->logical = spage->logical; + sbio->dev = wr_ctx->tgtdev; + bio = sbio->bio; + if (!bio) { + bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio); + if (!bio) { + mutex_unlock(&wr_ctx->wr_lock); + return -ENOMEM; + } + sbio->bio = bio; + } + + bio->bi_private = sbio; + bio->bi_end_io = scrub_wr_bio_end_io; + bio->bi_bdev = sbio->dev->bdev; + bio->bi_sector = sbio->physical >> 9; + sbio->err = 0; + } else if (sbio->physical + sbio->page_count * PAGE_SIZE != + spage->physical_for_dev_replace || + sbio->logical + sbio->page_count * PAGE_SIZE != + spage->logical) { + scrub_wr_submit(sctx); + goto again; + } + + ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); + if (ret != PAGE_SIZE) { + if (sbio->page_count < 1) { + bio_put(sbio->bio); + sbio->bio = NULL; + mutex_unlock(&wr_ctx->wr_lock); + return -EIO; + } + scrub_wr_submit(sctx); + goto again; + } + + sbio->pagev[sbio->page_count] = spage; + scrub_page_get(spage); + sbio->page_count++; + if (sbio->page_count == wr_ctx->pages_per_wr_bio) + scrub_wr_submit(sctx); + mutex_unlock(&wr_ctx->wr_lock); + + return 0; +} + +static void scrub_wr_submit(struct scrub_ctx *sctx) +{ + struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx; + struct scrub_bio *sbio; + + if (!wr_ctx->wr_curr_bio) + return; + + sbio = wr_ctx->wr_curr_bio; + wr_ctx->wr_curr_bio = NULL; + WARN_ON(!sbio->bio->bi_bdev); + scrub_pending_bio_inc(sctx); + /* process all writes in a single worker thread. 
Then the block layer + * orders the requests before sending them to the driver which + * doubled the write performance on spinning disks when measured + * with Linux 3.5 */ + btrfsic_submit_bio(WRITE, sbio->bio); +} + +static void scrub_wr_bio_end_io(struct bio *bio, int err) +{ + struct scrub_bio *sbio = bio->bi_private; + struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; + + sbio->err = err; + sbio->bio = bio; + + sbio->work.func = scrub_wr_bio_end_io_worker; + btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work); +} + +static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) +{ + struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); + struct scrub_ctx *sctx = sbio->sctx; + int i; + + WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO); + if (sbio->err) { + struct btrfs_dev_replace *dev_replace = + &sbio->sctx->dev_root->fs_info->dev_replace; + + for (i = 0; i < sbio->page_count; i++) { + struct scrub_page *spage = sbio->pagev[i]; + + spage->io_error = 1; + btrfs_dev_replace_stats_inc(&dev_replace-> + num_write_errors); + } + } + + for (i = 0; i < sbio->page_count; i++) + scrub_page_put(sbio->pagev[i]); + + bio_put(sbio->bio); + kfree(sbio); + scrub_pending_bio_dec(sctx); +} + +static int scrub_checksum(struct scrub_block *sblock) { u64 flags; int ret; @@ -1296,6 +1623,8 @@ static void scrub_checksum(struct scrub_block *sblock) WARN_ON(1); if (ret) scrub_handle_errored_block(sblock); + + return ret; } static int scrub_checksum_data(struct scrub_block *sblock) @@ -1386,7 +1715,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) BTRFS_UUID_SIZE)) ++fail; - BUG_ON(sctx->nodesize != sctx->leafsize); + WARN_ON(sctx->nodesize != sctx->leafsize); len = sctx->nodesize - BTRFS_CSUM_SIZE; mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; @@ -1534,11 +1863,24 @@ static void scrub_submit(struct scrub_ctx *sctx) sctx->curr = -1; scrub_pending_bio_inc(sctx); - btrfsic_submit_bio(READ, sbio->bio); + if (!sbio->bio->bi_bdev) { + /* + * this case should not happen. If btrfs_map_block() is + * wrong, it could happen for dev-replace operations on + * missing devices when no mirrors are available, but in + * this case it should already fail the mount. + * This case is handled correctly (but _very_ slowly). 
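+ * Completing the bio with -EIO below routes it through the
+ * normal read-error path: scrub_bio_end_io_worker() marks
+ * every page in the bio with io_error, and
+ * scrub_handle_errored_block() then re-reads the data from
+ * the remaining mirrors page by page, which is why this
+ * path is correct but very slow.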
+ */ + printk_ratelimited(KERN_WARNING + "btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n"); + bio_endio(sbio->bio, -EIO); + } else { + btrfsic_submit_bio(READ, sbio->bio); + } } -static int scrub_add_page_to_bio(struct scrub_ctx *sctx, - struct scrub_page *spage) +static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, + struct scrub_page *spage) { struct scrub_block *sblock = spage->sblock; struct scrub_bio *sbio; @@ -1570,7 +1912,7 @@ again: sbio->dev = spage->dev; bio = sbio->bio; if (!bio) { - bio = bio_alloc(GFP_NOFS, sctx->pages_per_bio); + bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio); if (!bio) return -ENOMEM; sbio->bio = bio; @@ -1602,10 +1944,10 @@ again: goto again; } - scrub_block_get(sblock); /* one for the added page */ + scrub_block_get(sblock); /* one for the page added to the bio */ atomic_inc(&sblock->outstanding_pages); sbio->page_count++; - if (sbio->page_count == sctx->pages_per_bio) + if (sbio->page_count == sctx->pages_per_rd_bio) scrub_submit(sctx); return 0; @@ -1613,7 +1955,8 @@ again: static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, u64 physical, struct btrfs_device *dev, u64 flags, - u64 gen, int mirror_num, u8 *csum, int force) + u64 gen, int mirror_num, u8 *csum, int force, + u64 physical_for_dev_replace) { struct scrub_block *sblock; int index; @@ -1654,6 +1997,7 @@ leave_nomem: spage->generation = gen; spage->logical = logical; spage->physical = physical; + spage->physical_for_dev_replace = physical_for_dev_replace; spage->mirror_num = mirror_num; if (csum) { spage->have_csum = 1; @@ -1668,6 +2012,7 @@ leave_nomem: len -= l; logical += l; physical += l; + physical_for_dev_replace += l; } WARN_ON(sblock->page_count == 0); @@ -1675,7 +2020,7 @@ leave_nomem: struct scrub_page *spage = sblock->pagev[index]; int ret; - ret = scrub_add_page_to_bio(sctx, spage); + ret = scrub_add_page_to_rd_bio(sctx, spage); if (ret) { scrub_block_put(sblock); return ret; @@ -1707,7 +2052,7 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) struct scrub_ctx *sctx = sbio->sctx; int i; - BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO); + BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO); if (sbio->err) { for (i = 0; i < sbio->page_count; i++) { struct scrub_page *spage = sbio->pagev[i]; @@ -1733,15 +2078,30 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) sbio->next_free = sctx->first_free; sctx->first_free = sbio->index; spin_unlock(&sctx->list_lock); + + if (sctx->is_dev_replace && + atomic_read(&sctx->wr_ctx.flush_all_writes)) { + mutex_lock(&sctx->wr_ctx.wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_ctx.wr_lock); + } + scrub_pending_bio_dec(sctx); } static void scrub_block_complete(struct scrub_block *sblock) { - if (!sblock->no_io_error_seen) + if (!sblock->no_io_error_seen) { scrub_handle_errored_block(sblock); - else - scrub_checksum(sblock); + } else { + /* + * if has checksum error, write via repair mechanism in + * dev replace case, otherwise write here in dev replace + * case. 
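+ * That is: scrub_checksum() now returns nonzero on a
+ * checksum error and has already handed the block to
+ * scrub_handle_errored_block(), which writes a good copy
+ * to the target device; only blocks that verify cleanly
+ * are written to the target device directly below.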
+ */ + if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace) + scrub_write_block_to_dev_replace(sblock); + } } static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, @@ -1786,7 +2146,7 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, /* scrub extent tries to collect up to 64 kB for each bio */ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, u64 physical, struct btrfs_device *dev, u64 flags, - u64 gen, int mirror_num) + u64 gen, int mirror_num, u64 physical_for_dev_replace) { int ret; u8 csum[BTRFS_CSUM_SIZE]; @@ -1799,7 +2159,7 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, sctx->stat.data_bytes_scrubbed += len; spin_unlock(&sctx->stat_lock); } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - BUG_ON(sctx->nodesize != sctx->leafsize); + WARN_ON(sctx->nodesize != sctx->leafsize); blocksize = sctx->nodesize; spin_lock(&sctx->stat_lock); sctx->stat.tree_extents_scrubbed++; @@ -1807,7 +2167,7 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, spin_unlock(&sctx->stat_lock); } else { blocksize = sctx->sectorsize; - BUG_ON(1); + WARN_ON(1); } while (len) { @@ -1819,14 +2179,23 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, have_csum = scrub_find_csum(sctx, logical, l, csum); if (have_csum == 0) ++sctx->stat.no_csum; + if (sctx->is_dev_replace && !have_csum) { + ret = copy_nocow_pages(sctx, logical, l, + mirror_num, + physical_for_dev_replace); + goto behind_scrub_pages; + } } ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen, - mirror_num, have_csum ? csum : NULL, 0); + mirror_num, have_csum ? csum : NULL, 0, + physical_for_dev_replace); +behind_scrub_pages: if (ret) return ret; len -= l; logical += l; physical += l; + physical_for_dev_replace += l; } return 0; } @@ -1834,7 +2203,8 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct map_lookup *map, struct btrfs_device *scrub_dev, - int num, u64 base, u64 length) + int num, u64 base, u64 length, + int is_dev_replace) { struct btrfs_path *path; struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; @@ -1859,6 +2229,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct btrfs_key key_end; u64 increment = map->stripe_len; u64 offset; + u64 extent_logical; + u64 extent_physical; + u64 extent_len; + struct btrfs_device *extent_dev; + int extent_mirror_num; nstripes = length; offset = 0; @@ -1966,9 +2341,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, */ if (atomic_read(&fs_info->scrub_pause_req)) { /* push queued extents */ + atomic_set(&sctx->wr_ctx.flush_all_writes, 1); scrub_submit(sctx); + mutex_lock(&sctx->wr_ctx.wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_ctx.wr_lock); wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); + atomic_set(&sctx->wr_ctx.flush_all_writes, 0); atomic_inc(&fs_info->scrubs_paused); wake_up(&fs_info->scrub_pause_wait); mutex_lock(&fs_info->scrub_lock); @@ -2063,10 +2443,20 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, key.objectid; } - ret = scrub_extent(sctx, key.objectid, key.offset, - key.objectid - logical + physical, - scrub_dev, flags, generation, - mirror_num); + extent_logical = key.objectid; + extent_physical = key.objectid - logical + physical; + extent_len = key.offset; + extent_dev = scrub_dev; + extent_mirror_num = mirror_num; + if (is_dev_replace) + 
scrub_remap_extent(fs_info, extent_logical, + extent_len, &extent_physical, + &extent_dev, + &extent_mirror_num); + ret = scrub_extent(sctx, extent_logical, extent_len, + extent_physical, extent_dev, flags, + generation, extent_mirror_num, + key.objectid - logical + physical); if (ret) goto out; @@ -2080,10 +2470,13 @@ next: sctx->stat.last_physical = physical; spin_unlock(&sctx->stat_lock); } +out: /* push queued extents */ scrub_submit(sctx); + mutex_lock(&sctx->wr_ctx.wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_ctx.wr_lock); -out: blk_finish_plug(&plug); btrfs_free_path(path); return ret < 0 ? ret : 0; @@ -2093,14 +2486,14 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length, - u64 dev_offset) + u64 dev_offset, int is_dev_replace) { struct btrfs_mapping_tree *map_tree = &sctx->dev_root->fs_info->mapping_tree; struct map_lookup *map; struct extent_map *em; int i; - int ret = -EINVAL; + int ret = 0; read_lock(&map_tree->map_tree.lock); em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); @@ -2120,7 +2513,8 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, if (map->stripes[i].dev->bdev == scrub_dev->bdev && map->stripes[i].physical == dev_offset) { ret = scrub_stripe(sctx, map, scrub_dev, i, - chunk_offset, length); + chunk_offset, length, + is_dev_replace); if (ret) goto out; } @@ -2133,7 +2527,8 @@ out: static noinline_for_stack int scrub_enumerate_chunks(struct scrub_ctx *sctx, - struct btrfs_device *scrub_dev, u64 start, u64 end) + struct btrfs_device *scrub_dev, u64 start, u64 end, + int is_dev_replace) { struct btrfs_dev_extent *dev_extent = NULL; struct btrfs_path *path; @@ -2149,6 +2544,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, struct btrfs_key key; struct btrfs_key found_key; struct btrfs_block_group_cache *cache; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; path = btrfs_alloc_path(); if (!path) @@ -2214,11 +2610,61 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, ret = -ENOENT; break; } + dev_replace->cursor_right = found_key.offset + length; + dev_replace->cursor_left = found_key.offset; + dev_replace->item_needs_writeback = 1; ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid, - chunk_offset, length, found_key.offset); + chunk_offset, length, found_key.offset, + is_dev_replace); + + /* + * flush, submit all pending read and write bios, afterwards + * wait for them. + * Note that in the dev replace case, a read request causes + * write requests that are submitted in the read completion + * worker. Therefore in the current situation, it is required + * that all write requests are flushed, so that all read and + * write requests are really completed when bios_in_flight + * changes to 0. 
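+ * The quiescing sequence below therefore is: set
+ * flush_all_writes so that read completions immediately
+ * flush the writes they queue, submit all pending read and
+ * write bios, wait until bios_in_flight drops to 0, then
+ * wait until workers_pending drops to 0 before the cursor
+ * is advanced.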
+ */ + atomic_set(&sctx->wr_ctx.flush_all_writes, 1); + scrub_submit(sctx); + mutex_lock(&sctx->wr_ctx.wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_ctx.wr_lock); + + wait_event(sctx->list_wait, + atomic_read(&sctx->bios_in_flight) == 0); + atomic_set(&sctx->wr_ctx.flush_all_writes, 0); + atomic_inc(&fs_info->scrubs_paused); + wake_up(&fs_info->scrub_pause_wait); + wait_event(sctx->list_wait, + atomic_read(&sctx->workers_pending) == 0); + + mutex_lock(&fs_info->scrub_lock); + while (atomic_read(&fs_info->scrub_pause_req)) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + atomic_read(&fs_info->scrub_pause_req) == 0); + mutex_lock(&fs_info->scrub_lock); + } + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + wake_up(&fs_info->scrub_pause_wait); + + dev_replace->cursor_left = dev_replace->cursor_right; + dev_replace->item_needs_writeback = 1; btrfs_put_block_group(cache); if (ret) break; + if (atomic64_read(&dev_replace->num_write_errors) > 0) { + ret = -EIO; + break; + } + if (sctx->stat.malloc_errors > 0) { + ret = -ENOMEM; + break; + } key.offset = found_key.offset + length; btrfs_release_path(path); @@ -2254,7 +2700,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, - NULL, 1); + NULL, 1, bytenr); if (ret) return ret; } @@ -2266,18 +2712,38 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, /* * get a reference count on fs_info->scrub_workers. start worker if necessary */ -static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info) +static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, + int is_dev_replace) { int ret = 0; mutex_lock(&fs_info->scrub_lock); if (fs_info->scrub_workers_refcnt == 0) { - btrfs_init_workers(&fs_info->scrub_workers, "scrub", - fs_info->thread_pool_size, &fs_info->generic_worker); + if (is_dev_replace) + btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1, + &fs_info->generic_worker); + else + btrfs_init_workers(&fs_info->scrub_workers, "scrub", + fs_info->thread_pool_size, + &fs_info->generic_worker); fs_info->scrub_workers.idle_thresh = 4; ret = btrfs_start_workers(&fs_info->scrub_workers); if (ret) goto out; + btrfs_init_workers(&fs_info->scrub_wr_completion_workers, + "scrubwrc", + fs_info->thread_pool_size, + &fs_info->generic_worker); + fs_info->scrub_wr_completion_workers.idle_thresh = 2; + ret = btrfs_start_workers( + &fs_info->scrub_wr_completion_workers); + if (ret) + goto out; + btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1, + &fs_info->generic_worker); + ret = btrfs_start_workers(&fs_info->scrub_nocow_workers); + if (ret) + goto out; } ++fs_info->scrub_workers_refcnt; out: @@ -2289,8 +2755,11 @@ out: static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) { mutex_lock(&fs_info->scrub_lock); - if (--fs_info->scrub_workers_refcnt == 0) + if (--fs_info->scrub_workers_refcnt == 0) { btrfs_stop_workers(&fs_info->scrub_workers); + btrfs_stop_workers(&fs_info->scrub_wr_completion_workers); + btrfs_stop_workers(&fs_info->scrub_nocow_workers); + } WARN_ON(fs_info->scrub_workers_refcnt < 0); mutex_unlock(&fs_info->scrub_lock); } @@ -2354,7 +2823,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return -EINVAL; } - ret = scrub_workers_get(fs_info); + ret = scrub_workers_get(fs_info, is_dev_replace); if (ret) return ret; @@ -2394,12 
+2863,15 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - down_read(&fs_info->scrub_super_lock); - ret = scrub_supers(sctx, dev); - up_read(&fs_info->scrub_super_lock); + if (!is_dev_replace) { + down_read(&fs_info->scrub_super_lock); + ret = scrub_supers(sctx, dev); + up_read(&fs_info->scrub_super_lock); + } if (!ret) - ret = scrub_enumerate_chunks(sctx, dev, start, end); + ret = scrub_enumerate_chunks(sctx, dev, start, end, + is_dev_replace); wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); atomic_dec(&fs_info->scrubs_running); @@ -2537,3 +3009,272 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV; } + +static void scrub_remap_extent(struct btrfs_fs_info *fs_info, + u64 extent_logical, u64 extent_len, + u64 *extent_physical, + struct btrfs_device **extent_dev, + int *extent_mirror_num) +{ + u64 mapped_length; + struct btrfs_bio *bbio = NULL; + int ret; + + mapped_length = extent_len; + ret = btrfs_map_block(fs_info, READ, extent_logical, + &mapped_length, &bbio, 0); + if (ret || !bbio || mapped_length < extent_len || + !bbio->stripes[0].dev->bdev) { + kfree(bbio); + return; + } + + *extent_physical = bbio->stripes[0].physical; + *extent_mirror_num = bbio->mirror_num; + *extent_dev = bbio->stripes[0].dev; + kfree(bbio); +} + +static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, + struct scrub_wr_ctx *wr_ctx, + struct btrfs_fs_info *fs_info, + struct btrfs_device *dev, + int is_dev_replace) +{ + WARN_ON(wr_ctx->wr_curr_bio != NULL); + + mutex_init(&wr_ctx->wr_lock); + wr_ctx->wr_curr_bio = NULL; + if (!is_dev_replace) + return 0; + + WARN_ON(!dev->bdev); + wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO, + bio_get_nr_vecs(dev->bdev)); + wr_ctx->tgtdev = dev; + atomic_set(&wr_ctx->flush_all_writes, 0); + return 0; +} + +static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx) +{ + mutex_lock(&wr_ctx->wr_lock); + kfree(wr_ctx->wr_curr_bio); + wr_ctx->wr_curr_bio = NULL; + mutex_unlock(&wr_ctx->wr_lock); +} + +static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, + int mirror_num, u64 physical_for_dev_replace) +{ + struct scrub_copy_nocow_ctx *nocow_ctx; + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + + nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS); + if (!nocow_ctx) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + return -ENOMEM; + } + + scrub_pending_trans_workers_inc(sctx); + + nocow_ctx->sctx = sctx; + nocow_ctx->logical = logical; + nocow_ctx->len = len; + nocow_ctx->mirror_num = mirror_num; + nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; + nocow_ctx->work.func = copy_nocow_pages_worker; + btrfs_queue_worker(&fs_info->scrub_nocow_workers, + &nocow_ctx->work); + + return 0; +} + +static void copy_nocow_pages_worker(struct btrfs_work *work) +{ + struct scrub_copy_nocow_ctx *nocow_ctx = + container_of(work, struct scrub_copy_nocow_ctx, work); + struct scrub_ctx *sctx = nocow_ctx->sctx; + u64 logical = nocow_ctx->logical; + u64 len = nocow_ctx->len; + int mirror_num = nocow_ctx->mirror_num; + u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; + int ret; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_fs_info *fs_info; + struct btrfs_path *path; + struct btrfs_root *root; + int not_written = 0; + + fs_info = sctx->dev_root->fs_info; + root = 
fs_info->extent_root; + + path = btrfs_alloc_path(); + if (!path) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + not_written = 1; + goto out; + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + not_written = 1; + goto out; + } + + ret = iterate_inodes_from_logical(logical, fs_info, path, + copy_nocow_pages_for_inode, + nocow_ctx); + if (ret != 0 && ret != -ENOENT) { + pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n", + (unsigned long long)logical, + (unsigned long long)physical_for_dev_replace, + (unsigned long long)len, + (unsigned long long)mirror_num, ret); + not_written = 1; + goto out; + } + +out: + if (trans && !IS_ERR(trans)) + btrfs_end_transaction(trans, root); + if (not_written) + btrfs_dev_replace_stats_inc(&fs_info->dev_replace. + num_uncorrectable_read_errors); + + btrfs_free_path(path); + kfree(nocow_ctx); + + scrub_pending_trans_workers_dec(sctx); +} + +static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) +{ + unsigned long index; + struct scrub_copy_nocow_ctx *nocow_ctx = ctx; + int ret = 0; + struct btrfs_key key; + struct inode *inode = NULL; + struct btrfs_root *local_root; + u64 physical_for_dev_replace; + u64 len; + struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; + + key.objectid = root; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + local_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(local_root)) + return PTR_ERR(local_root); + + key.type = BTRFS_INODE_ITEM_KEY; + key.objectid = inum; + key.offset = 0; + inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; + len = nocow_ctx->len; + while (len >= PAGE_CACHE_SIZE) { + struct page *page = NULL; + int ret_sub; + + index = offset >> PAGE_CACHE_SHIFT; + + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + if (!page) { + pr_err("find_or_create_page() failed\n"); + ret = -ENOMEM; + goto next_page; + } + + if (PageUptodate(page)) { + if (PageDirty(page)) + goto next_page; + } else { + ClearPageError(page); + ret_sub = extent_read_full_page(&BTRFS_I(inode)-> + io_tree, + page, btrfs_get_extent, + nocow_ctx->mirror_num); + if (ret_sub) { + ret = ret_sub; + goto next_page; + } + wait_on_page_locked(page); + if (!PageUptodate(page)) { + ret = -EIO; + goto next_page; + } + } + ret_sub = write_page_nocow(nocow_ctx->sctx, + physical_for_dev_replace, page); + if (ret_sub) { + ret = ret_sub; + goto next_page; + } + +next_page: + if (page) { + unlock_page(page); + put_page(page); + } + offset += PAGE_CACHE_SIZE; + physical_for_dev_replace += PAGE_CACHE_SIZE; + len -= PAGE_CACHE_SIZE; + } + + if (inode) + iput(inode); + return ret; +} + +static int write_page_nocow(struct scrub_ctx *sctx, + u64 physical_for_dev_replace, struct page *page) +{ + struct bio *bio; + struct btrfs_device *dev; + int ret; + DECLARE_COMPLETION_ONSTACK(compl); + + dev = sctx->wr_ctx.tgtdev; + if (!dev) + return -EIO; + if (!dev->bdev) { + printk_ratelimited(KERN_WARNING + "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); + return -EIO; + } + bio = bio_alloc(GFP_NOFS, 1); + if (!bio) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + return -ENOMEM; + } + bio->bi_private = &compl; + bio->bi_end_io = scrub_complete_bio_end_io; + bio->bi_size = 0; + bio->bi_sector = 
physical_for_dev_replace >> 9; + bio->bi_bdev = dev->bdev; + ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); + if (ret != PAGE_CACHE_SIZE) { +leave_with_eio: + bio_put(bio); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); + return -EIO; + } + btrfsic_submit_bio(WRITE_SYNC, bio); + wait_for_completion(&compl); + + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + goto leave_with_eio; + + bio_put(bio); + return 0; +} diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 837ad2d27853..ad4380684b9b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1195,7 +1195,8 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers, + new_pool_size); } static int btrfs_remount(struct super_block *sb, int *flags, char *data) -- cgit v1.2.1 From e93c89c1aaaaaec3487c4c18dd02360371790722 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 17:33:06 +0100 Subject: Btrfs: add new sources for device replace code This adds a new file to the sources together with the header file and the changes to ioctl.h and ctree.h that are required by the new C source file. Additionally, 4 new functions are added to volume.c that deal with device creation and destruction. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 +- fs/btrfs/ctree.h | 2 + fs/btrfs/dev-replace.c | 856 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/dev-replace.h | 18 ++ fs/btrfs/ioctl.h | 45 +++ fs/btrfs/volumes.c | 139 ++++++++ fs/btrfs/volumes.h | 8 + 7 files changed, 1069 insertions(+), 1 deletion(-) create mode 100644 fs/btrfs/dev-replace.c (limited to 'fs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index d7fcdba141a2..7df3e0f0ee51 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ - reada.o backref.o ulist.o qgroup.o send.o + reada.o backref.o ulist.o qgroup.o send.o dev-replace.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ded7caa0d304..45e7f752b64a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -142,6 +142,8 @@ struct btrfs_ordered_sum; #define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 +#define BTRFS_DEV_REPLACE_DEVID 0 + /* * the max metadata block size. This limit is somewhat artificial, * but the memmove costs go through the roof for larger blocks. diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c new file mode 100644 index 000000000000..66dbc8dbddf7 --- /dev/null +++ b/fs/btrfs/dev-replace.c @@ -0,0 +1,856 @@ +/* + * Copyright (C) STRATO AG 2012. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compat.h" +#include "ctree.h" +#include "extent_map.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "volumes.h" +#include "async-thread.h" +#include "check-integrity.h" +#include "rcu-string.h" +#include "dev-replace.h" + +static u64 btrfs_get_seconds_since_1970(void); +static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, + int scrub_ret); +static void btrfs_dev_replace_update_device_in_mapping_tree( + struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev, + struct btrfs_device *tgtdev); +static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, + char *srcdev_name, + struct btrfs_device **device); +static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); +static int btrfs_dev_replace_kthread(void *data); +static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info); + + +int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) +{ + struct btrfs_key key; + struct btrfs_root *dev_root = fs_info->dev_root; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct extent_buffer *eb; + int slot; + int ret = 0; + struct btrfs_path *path = NULL; + int item_size; + struct btrfs_dev_replace_item *ptr; + u64 src_devid; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = 0; + key.type = BTRFS_DEV_REPLACE_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); + if (ret) { +no_valid_dev_replace_entry_found: + ret = 0; + dev_replace->replace_state = + BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED; + dev_replace->cont_reading_from_srcdev_mode = + BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS; + dev_replace->replace_state = 0; + dev_replace->time_started = 0; + dev_replace->time_stopped = 0; + atomic64_set(&dev_replace->num_write_errors, 0); + atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); + dev_replace->cursor_left = 0; + dev_replace->committed_cursor_left = 0; + dev_replace->cursor_left_last_write_of_item = 0; + dev_replace->cursor_right = 0; + dev_replace->srcdev = NULL; + dev_replace->tgtdev = NULL; + dev_replace->is_valid = 0; + dev_replace->item_needs_writeback = 0; + goto out; + } + slot = path->slots[0]; + eb = path->nodes[0]; + item_size = btrfs_item_size_nr(eb, slot); + ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item); + + if (item_size != sizeof(struct btrfs_dev_replace_item)) { + pr_warn("btrfs: dev_replace entry found has unexpected size, ignore entry\n"); + goto no_valid_dev_replace_entry_found; + } + + src_devid = btrfs_dev_replace_src_devid(eb, ptr); + dev_replace->cont_reading_from_srcdev_mode = + btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr); + dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr); + dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr); + dev_replace->time_stopped = + 
btrfs_dev_replace_time_stopped(eb, ptr); + atomic64_set(&dev_replace->num_write_errors, + btrfs_dev_replace_num_write_errors(eb, ptr)); + atomic64_set(&dev_replace->num_uncorrectable_read_errors, + btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr)); + dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr); + dev_replace->committed_cursor_left = dev_replace->cursor_left; + dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left; + dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr); + dev_replace->is_valid = 1; + + dev_replace->item_needs_writeback = 0; + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + dev_replace->srcdev = NULL; + dev_replace->tgtdev = NULL; + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + dev_replace->srcdev = btrfs_find_device(fs_info, src_devid, + NULL, NULL); + dev_replace->tgtdev = btrfs_find_device(fs_info, + BTRFS_DEV_REPLACE_DEVID, + NULL, NULL); + /* + * allow 'btrfs dev replace_cancel' if src/tgt device is + * missing + */ + if (!dev_replace->srcdev && + !btrfs_test_opt(dev_root, DEGRADED)) { + ret = -EIO; + pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n", + (unsigned long long)src_devid); + } + if (!dev_replace->tgtdev && + !btrfs_test_opt(dev_root, DEGRADED)) { + ret = -EIO; + pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n", + (unsigned long long)BTRFS_DEV_REPLACE_DEVID); + } + if (dev_replace->tgtdev) { + if (dev_replace->srcdev) { + dev_replace->tgtdev->total_bytes = + dev_replace->srcdev->total_bytes; + dev_replace->tgtdev->disk_total_bytes = + dev_replace->srcdev->disk_total_bytes; + dev_replace->tgtdev->bytes_used = + dev_replace->srcdev->bytes_used; + } + dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1; + btrfs_init_dev_replace_tgtdev_for_resume(fs_info, + dev_replace->tgtdev); + } + break; + } + +out: + if (path) + btrfs_free_path(path); + return ret; +} + +/* + * called from commit_transaction. Writes changed device replace state to + * disk. + */ +int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + int ret; + struct btrfs_root *dev_root = fs_info->dev_root; + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_dev_replace_item *ptr; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + btrfs_dev_replace_lock(dev_replace); + if (!dev_replace->is_valid || + !dev_replace->item_needs_writeback) { + btrfs_dev_replace_unlock(dev_replace); + return 0; + } + btrfs_dev_replace_unlock(dev_replace); + + key.objectid = 0; + key.type = BTRFS_DEV_REPLACE_KEY; + key.offset = 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); + if (ret < 0) { + pr_warn("btrfs: error %d while searching for dev_replace item!\n", + ret); + goto out; + } + + if (ret == 0 && + btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { + /* + * need to delete old one and insert a new one. + * Since no attempt is made to recover any old state, if the + * dev_replace state is 'running', the data on the target + * drive is lost. 
+ * It would be possible to recover the state: just make sure + * that the beginning of the item is never changed and always + * contains all the essential information. Then read this + * minimal set of information and use it as a base for the + * new state. + */ + ret = btrfs_del_item(trans, dev_root, path); + if (ret != 0) { + pr_warn("btrfs: delete too small dev_replace item failed %d!\n", + ret); + goto out; + } + ret = 1; + } + + if (ret == 1) { + /* need to insert a new item */ + btrfs_release_path(path); + ret = btrfs_insert_empty_item(trans, dev_root, path, + &key, sizeof(*ptr)); + if (ret < 0) { + pr_warn("btrfs: insert dev_replace item failed %d!\n", + ret); + goto out; + } + } + + eb = path->nodes[0]; + ptr = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_dev_replace_item); + + btrfs_dev_replace_lock(dev_replace); + if (dev_replace->srcdev) + btrfs_set_dev_replace_src_devid(eb, ptr, + dev_replace->srcdev->devid); + else + btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1); + btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr, + dev_replace->cont_reading_from_srcdev_mode); + btrfs_set_dev_replace_replace_state(eb, ptr, + dev_replace->replace_state); + btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started); + btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped); + btrfs_set_dev_replace_num_write_errors(eb, ptr, + atomic64_read(&dev_replace->num_write_errors)); + btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr, + atomic64_read(&dev_replace->num_uncorrectable_read_errors)); + dev_replace->cursor_left_last_write_of_item = + dev_replace->cursor_left; + btrfs_set_dev_replace_cursor_left(eb, ptr, + dev_replace->cursor_left_last_write_of_item); + btrfs_set_dev_replace_cursor_right(eb, ptr, + dev_replace->cursor_right); + dev_replace->item_needs_writeback = 0; + btrfs_dev_replace_unlock(dev_replace); + + btrfs_mark_buffer_dirty(eb); + +out: + btrfs_free_path(path); + + return ret; +} + +void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + dev_replace->committed_cursor_left = + dev_replace->cursor_left_last_write_of_item; +} + +static u64 btrfs_get_seconds_since_1970(void) +{ + struct timespec t = CURRENT_TIME_SEC; + + return t.tv_sec; +} + +int btrfs_dev_replace_start(struct btrfs_root *root, + struct btrfs_ioctl_dev_replace_args *args) +{ + struct btrfs_trans_handle *trans; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + int ret; + struct btrfs_device *tgt_device = NULL; + struct btrfs_device *src_device = NULL; + + switch (args->start.cont_reading_from_srcdev_mode) { + case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: + case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: + break; + default: + return -EINVAL; + } + + if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') || + args->start.tgtdev_name[0] == '\0') + return -EINVAL; + + mutex_lock(&fs_info->volume_mutex); + ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, + &tgt_device); + if (ret) { + pr_err("btrfs: target device %s is invalid!\n", + args->start.tgtdev_name); + mutex_unlock(&fs_info->volume_mutex); + return -EINVAL; + } + + ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, + args->start.srcdev_name, + &src_device); + mutex_unlock(&fs_info->volume_mutex); + if (ret) { + ret = -EINVAL; + goto leave_no_lock; + } + + if (tgt_device->total_bytes 
< src_device->total_bytes) { + pr_err("btrfs: target device is smaller than source device!\n"); + ret = -EINVAL; + goto leave_no_lock; + } + + btrfs_dev_replace_lock(dev_replace); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; + goto leave; + } + + dev_replace->cont_reading_from_srcdev_mode = + args->start.cont_reading_from_srcdev_mode; + WARN_ON(!src_device); + dev_replace->srcdev = src_device; + WARN_ON(!tgt_device); + dev_replace->tgtdev = tgt_device; + + printk_in_rcu(KERN_INFO + "btrfs: dev_replace from %s (devid %llu) to %s) started\n", + src_device->missing ? "" : + rcu_str_deref(src_device->name), + src_device->devid, + rcu_str_deref(tgt_device->name)); + + tgt_device->total_bytes = src_device->total_bytes; + tgt_device->disk_total_bytes = src_device->disk_total_bytes; + tgt_device->bytes_used = src_device->bytes_used; + + /* + * from now on, the writes to the srcdev are all duplicated to + * go to the tgtdev as well (refer to btrfs_map_block()). + */ + dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; + dev_replace->time_started = btrfs_get_seconds_since_1970(); + dev_replace->cursor_left = 0; + dev_replace->committed_cursor_left = 0; + dev_replace->cursor_left_last_write_of_item = 0; + dev_replace->cursor_right = 0; + dev_replace->is_valid = 1; + dev_replace->item_needs_writeback = 1; + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + btrfs_dev_replace_unlock(dev_replace); + + btrfs_wait_ordered_extents(root, 0); + + /* force writing the updated state information to disk */ + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_dev_replace_lock(dev_replace); + goto leave; + } + + ret = btrfs_commit_transaction(trans, root); + WARN_ON(ret); + + /* the disk copy procedure reuses the scrub code */ + ret = btrfs_scrub_dev(fs_info, src_device->devid, 0, + src_device->total_bytes, + &dev_replace->scrub_progress, 0, 1); + + ret = btrfs_dev_replace_finishing(root->fs_info, ret); + WARN_ON(ret); + + return 0; + +leave: + dev_replace->srcdev = NULL; + dev_replace->tgtdev = NULL; + btrfs_dev_replace_unlock(dev_replace); +leave_no_lock: + if (tgt_device) + btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + return ret; +} + +static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, + int scrub_ret) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_device *tgt_device; + struct btrfs_device *src_device; + struct btrfs_root *root = fs_info->tree_root; + u8 uuid_tmp[BTRFS_UUID_SIZE]; + struct btrfs_trans_handle *trans; + int ret = 0; + + /* don't allow cancel or unmount to disturb the finishing procedure */ + mutex_lock(&dev_replace->lock_finishing_cancel_unmount); + + btrfs_dev_replace_lock(dev_replace); + /* was the operation canceled, or is it finished? 
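+ * (__btrfs_dev_replace_cancel() may have moved replace_state
+ * away from STARTED while the scrub was still running; in
+ * that case there is nothing left to finish here)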
*/ + if (dev_replace->replace_state != + BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) { + btrfs_dev_replace_unlock(dev_replace); + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return 0; + } + + tgt_device = dev_replace->tgtdev; + src_device = dev_replace->srcdev; + btrfs_dev_replace_unlock(dev_replace); + + /* replace old device with new one in mapping tree */ + if (!scrub_ret) + btrfs_dev_replace_update_device_in_mapping_tree(fs_info, + src_device, + tgt_device); + + /* + * flush all outstanding I/O and inode extent mappings before the + * copy operation is declared as being finished + */ + btrfs_start_delalloc_inodes(root, 0); + btrfs_wait_ordered_extents(root, 0); + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans, root); + WARN_ON(ret); + + /* keep away write_all_supers() during the finishing procedure */ + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + btrfs_dev_replace_lock(dev_replace); + dev_replace->replace_state = + scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED + : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; + dev_replace->tgtdev = NULL; + dev_replace->srcdev = NULL; + dev_replace->time_stopped = btrfs_get_seconds_since_1970(); + dev_replace->item_needs_writeback = 1; + + if (scrub_ret) { + printk_in_rcu(KERN_ERR + "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", + src_device->missing ? "" : + rcu_str_deref(src_device->name), + src_device->devid, + rcu_str_deref(tgt_device->name), scrub_ret); + btrfs_dev_replace_unlock(dev_replace); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + if (tgt_device) + btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + + return 0; + } + + printk_in_rcu(KERN_INFO + "btrfs: dev_replace from %s (devid %llu) to %s) finished\n", + src_device->missing ? "" : + rcu_str_deref(src_device->name), + src_device->devid, + rcu_str_deref(tgt_device->name)); + tgt_device->is_tgtdev_for_dev_replace = 0; + tgt_device->devid = src_device->devid; + src_device->devid = BTRFS_DEV_REPLACE_DEVID; + tgt_device->bytes_used = src_device->bytes_used; + memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp)); + memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid)); + memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid)); + tgt_device->total_bytes = src_device->total_bytes; + tgt_device->disk_total_bytes = src_device->disk_total_bytes; + tgt_device->bytes_used = src_device->bytes_used; + if (fs_info->sb->s_bdev == src_device->bdev) + fs_info->sb->s_bdev = tgt_device->bdev; + if (fs_info->fs_devices->latest_bdev == src_device->bdev) + fs_info->fs_devices->latest_bdev = tgt_device->bdev; + list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); + + btrfs_rm_dev_replace_srcdev(fs_info, src_device); + if (src_device->bdev) { + /* zero out the old super */ + btrfs_scratch_superblock(src_device); + } + /* + * this is again a consistent state where no dev_replace procedure + * is running, the target device is part of the filesystem, the + * source device is not part of the filesystem anymore and its 1st + * superblock is scratched out so that it is no longer marked to + * belong to this filesystem. 
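+ * The target device has also taken over the devid and uuid
+ * of the source device above, so from now on it appears to
+ * the rest of the filesystem under the identity of the
+ * device it replaced.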
+ */ + btrfs_dev_replace_unlock(dev_replace); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + + /* write back the superblocks */ + trans = btrfs_start_transaction(root, 0); + if (!IS_ERR(trans)) + btrfs_commit_transaction(trans, root); + + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + + return 0; +} + +static void btrfs_dev_replace_update_device_in_mapping_tree( + struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev, + struct btrfs_device *tgtdev) +{ + struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; + struct extent_map *em; + struct map_lookup *map; + u64 start = 0; + int i; + + write_lock(&em_tree->lock); + do { + em = lookup_extent_mapping(em_tree, start, (u64)-1); + if (!em) + break; + map = (struct map_lookup *)em->bdev; + for (i = 0; i < map->num_stripes; i++) + if (srcdev == map->stripes[i].dev) + map->stripes[i].dev = tgtdev; + start = em->start + em->len; + free_extent_map(em); + } while (start); + write_unlock(&em_tree->lock); +} + +static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, + char *srcdev_name, + struct btrfs_device **device) +{ + int ret; + + if (srcdevid) { + ret = 0; + *device = btrfs_find_device(root->fs_info, srcdevid, NULL, + NULL); + if (!*device) + ret = -ENOENT; + } else { + ret = btrfs_find_device_missing_or_by_path(root, srcdev_name, + device); + } + return ret; +} + +void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dev_replace_args *args) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + btrfs_dev_replace_lock(dev_replace); + /* even if !dev_replace_is_valid, the values are good enough for + * the replace_status ioctl */ + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + args->status.replace_state = dev_replace->replace_state; + args->status.time_started = dev_replace->time_started; + args->status.time_stopped = dev_replace->time_stopped; + args->status.num_write_errors = + atomic64_read(&dev_replace->num_write_errors); + args->status.num_uncorrectable_read_errors = + atomic64_read(&dev_replace->num_uncorrectable_read_errors); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + args->status.progress_1000 = 0; + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + args->status.progress_1000 = 1000; + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + args->status.progress_1000 = div64_u64(dev_replace->cursor_left, + div64_u64(dev_replace->srcdev->total_bytes, 1000)); + break; + } + btrfs_dev_replace_unlock(dev_replace); +} + +int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dev_replace_args *args) +{ + args->result = __btrfs_dev_replace_cancel(fs_info); + return 0; +} + +static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_device *tgt_device = NULL; + struct btrfs_trans_handle *trans; + struct btrfs_root *root = fs_info->tree_root; + u64 result; + int ret; + + mutex_lock(&dev_replace->lock_finishing_cancel_unmount); + btrfs_dev_replace_lock(dev_replace); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; + btrfs_dev_replace_unlock(dev_replace); + goto 
leave; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + tgt_device = dev_replace->tgtdev; + dev_replace->tgtdev = NULL; + dev_replace->srcdev = NULL; + break; + } + dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; + dev_replace->time_stopped = btrfs_get_seconds_since_1970(); + dev_replace->item_needs_writeback = 1; + btrfs_dev_replace_unlock(dev_replace); + btrfs_scrub_cancel(fs_info); + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans, root); + WARN_ON(ret); + if (tgt_device) + btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + +leave: + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return result; +} + +void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + mutex_lock(&dev_replace->lock_finishing_cancel_unmount); + btrfs_dev_replace_lock(dev_replace); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; + dev_replace->time_stopped = btrfs_get_seconds_since_1970(); + dev_replace->item_needs_writeback = 1; + pr_info("btrfs: suspending dev_replace for unmount\n"); + break; + } + + btrfs_dev_replace_unlock(dev_replace); + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); +} + +/* resume dev_replace procedure that was interrupted by unmount */ +int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) +{ + struct task_struct *task; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + btrfs_dev_replace_lock(dev_replace); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + btrfs_dev_replace_unlock(dev_replace); + return 0; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; + break; + } + if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) { + pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n" + "btrfs: you may cancel the operation after 'mount -o degraded'\n"); + btrfs_dev_replace_unlock(dev_replace); + return 0; + } + btrfs_dev_replace_unlock(dev_replace); + + WARN_ON(atomic_xchg( + &fs_info->mutually_exclusive_operation_running, 1)); + task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); + return PTR_RET(task); +} + +static int btrfs_dev_replace_kthread(void *data) +{ + struct btrfs_fs_info *fs_info = data; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_ioctl_dev_replace_args *status_args; + u64 progress; + + status_args = kzalloc(sizeof(*status_args), GFP_NOFS); + if (status_args) { + btrfs_dev_replace_status(fs_info, status_args); + progress = status_args->status.progress_1000; + kfree(status_args); + do_div(progress, 10); + printk_in_rcu(KERN_INFO + "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n", + dev_replace->srcdev->missing ? 
"" : + rcu_str_deref(dev_replace->srcdev->name), + dev_replace->srcdev->devid, + dev_replace->tgtdev ? + rcu_str_deref(dev_replace->tgtdev->name) : + "", + (unsigned int)progress); + } + btrfs_dev_replace_continue_on_mount(fs_info); + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + + return 0; +} + +static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + int ret; + + ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid, + dev_replace->committed_cursor_left, + dev_replace->srcdev->total_bytes, + &dev_replace->scrub_progress, 0, 1); + ret = btrfs_dev_replace_finishing(fs_info, ret); + WARN_ON(ret); + return 0; +} + +int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) +{ + if (!dev_replace->is_valid) + return 0; + + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + return 0; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + /* + * return true even if tgtdev is missing (this is + * something that can happen if the dev_replace + * procedure is suspended by an umount and then + * the tgtdev is missing (or "btrfs dev scan") was + * not called and the the filesystem is remounted + * in degraded state. This does not stop the + * dev_replace procedure. It needs to be canceled + * manually if the cancelation is wanted. + */ + break; + } + return 1; +} + +void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace) +{ + /* the beginning is just an optimization for the typical case */ + if (atomic_read(&dev_replace->nesting_level) == 0) { +acquire_lock: + /* this is not a nested case where the same thread + * is trying to acqurire the same lock twice */ + mutex_lock(&dev_replace->lock); + mutex_lock(&dev_replace->lock_management_lock); + dev_replace->lock_owner = current->pid; + atomic_inc(&dev_replace->nesting_level); + mutex_unlock(&dev_replace->lock_management_lock); + return; + } + + mutex_lock(&dev_replace->lock_management_lock); + if (atomic_read(&dev_replace->nesting_level) > 0 && + dev_replace->lock_owner == current->pid) { + WARN_ON(!mutex_is_locked(&dev_replace->lock)); + atomic_inc(&dev_replace->nesting_level); + mutex_unlock(&dev_replace->lock_management_lock); + return; + } + + mutex_unlock(&dev_replace->lock_management_lock); + goto acquire_lock; +} + +void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace) +{ + WARN_ON(!mutex_is_locked(&dev_replace->lock)); + mutex_lock(&dev_replace->lock_management_lock); + WARN_ON(atomic_read(&dev_replace->nesting_level) < 1); + WARN_ON(dev_replace->lock_owner != current->pid); + atomic_dec(&dev_replace->nesting_level); + if (atomic_read(&dev_replace->nesting_level) == 0) { + dev_replace->lock_owner = 0; + mutex_unlock(&dev_replace->lock_management_lock); + mutex_unlock(&dev_replace->lock); + } else { + mutex_unlock(&dev_replace->lock_management_lock); + } +} diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 1fb5c89037ee..20035cbbf021 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -19,6 +19,24 @@ #if !defined(__BTRFS_DEV_REPLACE__) #define __BTRFS_DEV_REPLACE__ +struct btrfs_ioctl_dev_replace_args; + +int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info); +int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +void 
btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info); +int btrfs_dev_replace_start(struct btrfs_root *root, + struct btrfs_ioctl_dev_replace_args *args); +void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dev_replace_args *args); +int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dev_replace_args *args); +void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); +int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); +int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); +void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace); +void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace); + static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value) { atomic64_inc(stat_value); diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 731e2875ab93..62006ba02719 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -123,6 +123,48 @@ struct btrfs_ioctl_scrub_args { __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8]; }; +#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0 +#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 +struct btrfs_ioctl_dev_replace_start_params { + __u64 srcdevid; /* in, if 0, use srcdev_name instead */ + __u8 srcdev_name[BTRFS_PATH_NAME_MAX + 1]; /* in */ + __u8 tgtdev_name[BTRFS_PATH_NAME_MAX + 1]; /* in */ + __u64 cont_reading_from_srcdev_mode; /* in, see #define + * above */ +}; + +#define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED 0 +#define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED 1 +#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED 2 +#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 3 +#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4 +struct btrfs_ioctl_dev_replace_status_params { + __u64 replace_state; /* out, see #define above */ + __u64 progress_1000; /* out, 0 <= x <= 1000 */ + __u64 time_started; /* out, seconds since 1-Jan-1970 */ + __u64 time_stopped; /* out, seconds since 1-Jan-1970 */ + __u64 num_write_errors; /* out */ + __u64 num_uncorrectable_read_errors; /* out */ +}; + +#define BTRFS_IOCTL_DEV_REPLACE_CMD_START 0 +#define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS 1 +#define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL 2 +#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR 0 +#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED 1 +#define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED 2 +struct btrfs_ioctl_dev_replace_args { + __u64 cmd; /* in */ + __u64 result; /* out */ + + union { + struct btrfs_ioctl_dev_replace_start_params start; + struct btrfs_ioctl_dev_replace_status_params status; + }; /* in/out */ + + __u64 spare[64]; +}; + #define BTRFS_DEVICE_PATH_NAME_MAX 1024 struct btrfs_ioctl_dev_info_args { __u64 devid; /* in/out */ @@ -453,4 +495,7 @@ struct btrfs_ioctl_send_args { struct btrfs_ioctl_qgroup_limit_args) #define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ struct btrfs_ioctl_get_dev_stats) +#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \ + struct btrfs_ioctl_dev_replace_args) + #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 415862885b67..5777e6a9aab1 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1537,6 +1537,53 @@ error_undo: goto error_brelse; } +void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev) +{ + WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex)); + list_del_rcu(&srcdev->dev_list); + list_del_rcu(&srcdev->dev_alloc_list); + 
fs_info->fs_devices->num_devices--; + if (srcdev->missing) { + fs_info->fs_devices->missing_devices--; + fs_info->fs_devices->rw_devices++; + } + if (srcdev->can_discard) + fs_info->fs_devices->num_can_discard--; + if (srcdev->bdev) + fs_info->fs_devices->open_devices--; + + call_rcu(&srcdev->rcu, free_device); +} + +void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *tgtdev) +{ + struct btrfs_device *next_device; + + WARN_ON(!tgtdev); + mutex_lock(&fs_info->fs_devices->device_list_mutex); + if (tgtdev->bdev) { + btrfs_scratch_superblock(tgtdev); + fs_info->fs_devices->open_devices--; + } + fs_info->fs_devices->num_devices--; + if (tgtdev->can_discard) + fs_info->fs_devices->num_can_discard++; + + next_device = list_entry(fs_info->fs_devices->devices.next, + struct btrfs_device, dev_list); + if (tgtdev->bdev == fs_info->sb->s_bdev) + fs_info->sb->s_bdev = next_device->bdev; + if (tgtdev->bdev == fs_info->fs_devices->latest_bdev) + fs_info->fs_devices->latest_bdev = next_device->bdev; + list_del_rcu(&tgtdev->dev_list); + + call_rcu(&tgtdev->rcu, free_device); + + mutex_unlock(&fs_info->fs_devices->device_list_mutex); +} + int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, struct btrfs_device **device) { @@ -1931,6 +1978,98 @@ error: return ret; } +int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, + struct btrfs_device **device_out) +{ + struct request_queue *q; + struct btrfs_device *device; + struct block_device *bdev; + struct btrfs_fs_info *fs_info = root->fs_info; + struct list_head *devices; + struct rcu_string *name; + int ret = 0; + + *device_out = NULL; + if (fs_info->fs_devices->seeding) + return -EINVAL; + + bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, + fs_info->bdev_holder); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + + filemap_write_and_wait(bdev->bd_inode->i_mapping); + + devices = &fs_info->fs_devices->devices; + list_for_each_entry(device, devices, dev_list) { + if (device->bdev == bdev) { + ret = -EEXIST; + goto error; + } + } + + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) { + ret = -ENOMEM; + goto error; + } + + name = rcu_string_strdup(device_path, GFP_NOFS); + if (!name) { + kfree(device); + ret = -ENOMEM; + goto error; + } + rcu_assign_pointer(device->name, name); + + q = bdev_get_queue(bdev); + if (blk_queue_discard(q)) + device->can_discard = 1; + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + device->writeable = 1; + device->work.func = pending_bios_fn; + generate_random_uuid(device->uuid); + device->devid = BTRFS_DEV_REPLACE_DEVID; + spin_lock_init(&device->io_lock); + device->generation = 0; + device->io_width = root->sectorsize; + device->io_align = root->sectorsize; + device->sector_size = root->sectorsize; + device->total_bytes = i_size_read(bdev->bd_inode); + device->disk_total_bytes = device->total_bytes; + device->dev_root = fs_info->dev_root; + device->bdev = bdev; + device->in_fs_metadata = 1; + device->is_tgtdev_for_dev_replace = 1; + device->mode = FMODE_EXCL; + set_blocksize(device->bdev, 4096); + device->fs_devices = fs_info->fs_devices; + list_add(&device->dev_list, &fs_info->fs_devices->devices); + fs_info->fs_devices->num_devices++; + fs_info->fs_devices->open_devices++; + if (device->can_discard) + fs_info->fs_devices->num_can_discard++; + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + + *device_out = device; + return ret; + +error: + blkdev_put(bdev, FMODE_EXCL); + return ret; +} + +void 
btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, + struct btrfs_device *tgtdev) +{ + WARN_ON(fs_info->fs_devices->rw_devices == 0); + tgtdev->io_width = fs_info->dev_root->sectorsize; + tgtdev->io_align = fs_info->dev_root->sectorsize; + tgtdev->sector_size = fs_info->dev_root->sectorsize; + tgtdev->dev_root = fs_info->dev_root; + tgtdev->in_fs_metadata = 1; +} + static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, struct btrfs_device *device) { diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 8fd5a4d8acc8..58d79375deaf 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -286,6 +286,8 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, u8 *uuid, u8 *fsid); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_root *root, char *path); +int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, + struct btrfs_device **device_out); int btrfs_balance(struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs); int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); @@ -302,6 +304,12 @@ int btrfs_get_dev_stats(struct btrfs_root *root, int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); +void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev); +void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *tgtdev); +void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, + struct btrfs_device *tgtdev); int btrfs_scratch_superblock(struct btrfs_device *device); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, -- cgit v1.2.1 From 8dabb7420f014ab0f9f04afae8ae046c0f48b270 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 6 Nov 2012 13:15:27 +0100 Subject: Btrfs: change core code of btrfs to support the device replace operations This commit contains all the essential changes to the core code of Btrfs for support of the device replace procedure. 
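The core changes below repeat one pattern: sample the replace state under the dev-replace lock before relying on device counts, because the replace target is attached to the device list but is not an independent device. A condensed sketch of that pattern (illustrative only; the locking helpers and the is-ongoing test are the ones introduced earlier in this series):

	u64 num_devices = fs_info->fs_devices->num_devices;

	btrfs_dev_replace_lock(&fs_info->dev_replace);
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
		/* the replace target is not an independent device */
		num_devices--;
	}
	btrfs_dev_replace_unlock(&fs_info->dev_replace);

The adjusted num_devices is then what the RAID profile constraints are checked against, as in the btrfs_rm_device() and btrfs_balance() hunks below.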
Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 24 +++++++++++++++++++++- fs/btrfs/reada.c | 17 ++++++++++++++++ fs/btrfs/scrub.c | 7 ++++++- fs/btrfs/super.c | 13 ++++++++++++ fs/btrfs/transaction.c | 7 ++++++- fs/btrfs/volumes.c | 54 ++++++++++++++++++++++++++++++++++++++++---------- fs/btrfs/volumes.h | 3 ++- 7 files changed, 111 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0e410478ad27..76b82506bf92 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -45,6 +45,7 @@ #include "inode-map.h" #include "check-integrity.h" #include "rcu-string.h" +#include "dev-replace.h" #ifdef CONFIG_X86 #include @@ -2438,7 +2439,11 @@ int open_ctree(struct super_block *sb, goto fail_tree_roots; } - btrfs_close_extra_devices(fs_devices); + /* + * keep the device that is marked to be the target device for the + * dev_replace procedure + */ + btrfs_close_extra_devices(fs_info, fs_devices, 0); if (!fs_devices->latest_bdev) { printk(KERN_CRIT "btrfs: failed to read devices on %s\n", @@ -2510,6 +2515,14 @@ retry_root_backup: goto fail_block_groups; } + ret = btrfs_init_dev_replace(fs_info); + if (ret) { + pr_err("btrfs: failed to init dev_replace: %d\n", ret); + goto fail_block_groups; + } + + btrfs_close_extra_devices(fs_info, fs_devices, 1); + ret = btrfs_init_space_info(fs_info); if (ret) { printk(KERN_ERR "Failed to initial space info: %d\n", ret); @@ -2658,6 +2671,13 @@ retry_root_backup: return ret; } + ret = btrfs_resume_dev_replace_async(fs_info); + if (ret) { + pr_warn("btrfs: failed to resume dev_replace\n"); + close_ctree(tree_root); + return ret; + } + return 0; fail_qgroup: @@ -3300,6 +3320,8 @@ int close_ctree(struct btrfs_root *root) /* pause restriper - we want to resume on mount */ btrfs_pause_balance(fs_info); + btrfs_dev_replace_suspend_for_unmount(fs_info); + btrfs_scrub_cancel(fs_info); /* wait for any defraggers to finish */ diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 9f363e17ec74..c705a48e676b 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -27,6 +27,7 @@ #include "volumes.h" #include "disk-io.h" #include "transaction.h" +#include "dev-replace.h" #undef DEBUG @@ -331,6 +332,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, int nzones = 0; int i; unsigned long index = logical >> PAGE_CACHE_SHIFT; + int dev_replace_is_ongoing; spin_lock(&fs_info->reada_lock); re = radix_tree_lookup(&fs_info->reada_tree, index); @@ -392,6 +394,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, } /* insert extent in reada_tree + all per-device trees, all or nothing */ + btrfs_dev_replace_lock(&fs_info->dev_replace); spin_lock(&fs_info->reada_lock); ret = radix_tree_insert(&fs_info->reada_tree, index, re); if (ret == -EEXIST) { @@ -399,13 +402,17 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, BUG_ON(!re_exist); re_exist->refcnt++; spin_unlock(&fs_info->reada_lock); + btrfs_dev_replace_unlock(&fs_info->dev_replace); goto error; } if (ret) { spin_unlock(&fs_info->reada_lock); + btrfs_dev_replace_unlock(&fs_info->dev_replace); goto error; } prev_dev = NULL; + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing( + &fs_info->dev_replace); for (i = 0; i < nzones; ++i) { dev = bbio->stripes[i].dev; if (dev == prev_dev) { @@ -422,6 +429,14 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, /* cannot read ahead on missing device */ continue; } + if (dev_replace_is_ongoing && + dev == 
fs_info->dev_replace.tgtdev) { + /* + * as this device is selected for reading only as + * a last resort, skip it for read ahead. + */ + continue; + } prev_dev = dev; ret = radix_tree_insert(&dev->reada_extents, index, re); if (ret) { @@ -434,10 +449,12 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, BUG_ON(fs_info == NULL); radix_tree_delete(&fs_info->reada_tree, index); spin_unlock(&fs_info->reada_lock); + btrfs_dev_replace_unlock(&fs_info->dev_replace); goto error; } } spin_unlock(&fs_info->reada_lock); + btrfs_dev_replace_unlock(&fs_info->dev_replace); kfree(bbio); return re; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 61157a26cf2a..30cbf6921c0b 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2843,12 +2843,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return -EIO; } - if (dev->scrub_device) { + btrfs_dev_replace_lock(&fs_info->dev_replace); + if (dev->scrub_device || + (!is_dev_replace && + btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { + btrfs_dev_replace_unlock(&fs_info->dev_replace); mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); scrub_workers_put(fs_info); return -EINPROGRESS; } + btrfs_dev_replace_unlock(&fs_info->dev_replace); sctx = scrub_setup_ctx(dev, is_dev_replace); if (IS_ERR(sctx)) { mutex_unlock(&fs_info->scrub_lock); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ad4380684b9b..def4f24b58df 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -55,6 +55,7 @@ #include "export.h" #include "compression.h" #include "rcu-string.h" +#include "dev-replace.h" #define CREATE_TRACE_POINTS #include @@ -1225,8 +1226,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) return 0; if (*flags & MS_RDONLY) { + /* + * this also happens on 'umount -rf' or on shutdown, when + * the filesystem is busy. 
+ */ sb->s_flags |= MS_RDONLY; + btrfs_dev_replace_suspend_for_unmount(fs_info); + btrfs_scrub_cancel(fs_info); + ret = btrfs_commit_super(root); if (ret) goto restore; @@ -1263,6 +1271,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (ret) goto restore; + ret = btrfs_resume_dev_replace_async(fs_info); + if (ret) { + pr_warn("btrfs: failed to resume dev_replace\n"); + goto restore; + } sb->s_flags &= ~MS_RDONLY; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 7b297354e738..bcc6b65be3b0 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -30,6 +30,7 @@ #include "tree-log.h" #include "inode-map.h" #include "volumes.h" +#include "dev-replace.h" #define BTRFS_ROOT_TRANS_TAG 0 @@ -845,7 +846,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, return ret; ret = btrfs_run_dev_stats(trans, root->fs_info); - BUG_ON(ret); + WARN_ON(ret); + ret = btrfs_run_dev_replace(trans, root->fs_info); + WARN_ON(ret); ret = btrfs_run_qgroups(trans, root->fs_info); BUG_ON(ret); @@ -868,6 +871,8 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, switch_commit_root(fs_info->extent_root); up_write(&fs_info->extent_commit_sem); + btrfs_after_dev_replace_commit(fs_info); + return 0; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5777e6a9aab1..a4e0963bf457 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -36,6 +36,7 @@ #include "check-integrity.h" #include "rcu-string.h" #include "math.h" +#include "dev-replace.h" static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -505,7 +506,8 @@ error: return ERR_PTR(-ENOMEM); } -void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) +void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, + struct btrfs_fs_devices *fs_devices, int step) { struct btrfs_device *device, *next; @@ -528,6 +530,21 @@ again: continue; } + if (device->devid == BTRFS_DEV_REPLACE_DEVID) { + /* + * In the first step, keep the device which has + * the correct fsid and the devid that is used + * for the dev_replace procedure. + * In the second step, the dev_replace state is + * read from the device tree and it is known + * whether the procedure is really active or + * not, which means whether this device is + * used or whether it should be removed. 
+ */ + if (step == 0 || device->is_tgtdev_for_dev_replace) { + continue; + } + } if (device->bdev) { blkdev_put(device->bdev, device->mode); device->bdev = NULL; @@ -536,7 +553,8 @@ again: if (device->writeable) { list_del_init(&device->dev_alloc_list); device->writeable = 0; - fs_devices->rw_devices--; + if (!device->is_tgtdev_for_dev_replace) + fs_devices->rw_devices--; } list_del_init(&device->dev_list); fs_devices->num_devices--; @@ -594,7 +612,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) if (device->bdev) fs_devices->open_devices--; - if (device->writeable) { + if (device->writeable && !device->is_tgtdev_for_dev_replace) { list_del_init(&device->dev_alloc_list); fs_devices->rw_devices--; } @@ -718,7 +736,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fs_devices->rotating = 1; fs_devices->open_devices++; - if (device->writeable) { + if (device->writeable && !device->is_tgtdev_for_dev_replace) { fs_devices->rw_devices++; list_add(&device->dev_alloc_list, &fs_devices->alloc_list); @@ -1350,16 +1368,22 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) root->fs_info->avail_system_alloc_bits | root->fs_info->avail_metadata_alloc_bits; - if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && - root->fs_info->fs_devices->num_devices <= 4) { + num_devices = root->fs_info->fs_devices->num_devices; + btrfs_dev_replace_lock(&root->fs_info->dev_replace); + if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { + WARN_ON(num_devices < 1); + num_devices--; + } + btrfs_dev_replace_unlock(&root->fs_info->dev_replace); + + if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { printk(KERN_ERR "btrfs: unable to go below four devices " "on raid10\n"); ret = -EINVAL; goto out; } - if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && - root->fs_info->fs_devices->num_devices <= 2) { + if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { printk(KERN_ERR "btrfs: unable to go below two " "devices on raid1\n"); ret = -EINVAL; @@ -2935,6 +2959,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, u64 allowed; int mixed = 0; int ret; + u64 num_devices; if (btrfs_fs_closing(fs_info) || atomic_read(&fs_info->balance_pause_req) || @@ -2963,10 +2988,17 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } } + num_devices = fs_info->fs_devices->num_devices; + btrfs_dev_replace_lock(&fs_info->dev_replace); + if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { + BUG_ON(num_devices < 1); + num_devices--; + } + btrfs_dev_replace_unlock(&fs_info->dev_replace); allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; - if (fs_info->fs_devices->num_devices == 1) + if (num_devices == 1) allowed |= BTRFS_BLOCK_GROUP_DUP; - else if (fs_info->fs_devices->num_devices < 4) + else if (num_devices < 4) allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); else allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | @@ -3591,6 +3623,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, devices_info[ndevs].total_avail = total_avail; devices_info[ndevs].dev = device; ++ndevs; + WARN_ON(ndevs > fs_devices->rw_devices); } /* @@ -4773,6 +4806,7 @@ static void fill_device_from_item(struct extent_buffer *leaf, device->io_align = btrfs_device_io_align(leaf, dev_item); device->io_width = btrfs_device_io_width(leaf, dev_item); device->sector_size = btrfs_device_sector_size(leaf, dev_item); + WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); device->is_tgtdev_for_dev_replace = 0; ptr = (unsigned 
long)btrfs_device_uuid(dev_item); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 58d79375deaf..37d0157167b0 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -268,7 +268,8 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_fs_devices **fs_devices_ret); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); -void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); +void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, + struct btrfs_fs_devices *fs_devices, int step); int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, char *device_path, struct btrfs_device **device); -- cgit v1.2.1 From 29a8d9a0bce6a5abac1f313400c2e189e8d10e67 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 6 Nov 2012 14:16:24 +0100 Subject: Btrfs: introduce GET_READ_MIRRORS functionality for btrfs_map_block() Before this commit, btrfs_map_block() was called with REQ_WRITE in order to retrieve the list of mirrors for a disk block. This needs to be changed for the device replace procedure since it makes a difference whether you are asking for read mirrors or for locations to write to. GET_READ_MIRRORS is introduced as a new interface to call btrfs_map_block(). In the current commit, the functionality is not yet changed, only the interface for GET_READ_MIRRORS is introduced and all the places that should use this new interface are adapted. The reason that REQ_WRITE cannot be abused anymore to retrieve a list of read mirrors is that during a running dev replace operation all write requests to the live filesystem are duplicated to also write to the target drive. Keep in mind that the target disk is only partially a valid copy of the source disk while the operation is ongoing. All writes go to the target disk, but not all reads would return valid data on the target disk. Therefore it is not possible anymore to abuse a REQ_WRITE interface to find valid mirrors for a REQ_READ. 
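As an illustration, a caller that needs the list of read mirrors now passes REQ_GET_READ_MIRRORS instead of REQ_WRITE. A sketch modelled on the reada.c hunk below, where logical and blocksize stand for the caller's values:

	struct btrfs_bio *bbio = NULL;
	u64 length = blocksize;
	int ret;

	/* ask for read mirrors, not for locations to write to */
	ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
			      &length, &bbio, 0);
	if (ret || !bbio || length < blocksize)
		goto error;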
Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 3 +++ fs/btrfs/reada.c | 3 ++- fs/btrfs/scrub.c | 4 ++-- fs/btrfs/volumes.c | 8 ++++---- 4 files changed, 11 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 45e7f752b64a..46bd7d5f504b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -174,6 +174,9 @@ static int btrfs_csum_sizes[] = { 4, 0 }; /* four bytes for CRC32 */ #define BTRFS_EMPTY_DIR_SIZE 0 +/* spefic to btrfs_map_block(), therefore not in include/linux/blk_types.h */ +#define REQ_GET_READ_MIRRORS (1 << 30) + #define BTRFS_FT_UNKNOWN 0 #define BTRFS_FT_REG_FILE 1 #define BTRFS_FT_DIR 2 diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index c705a48e676b..96b93daa0bbb 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -359,7 +359,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, * map block */ length = blocksize; - ret = btrfs_map_block(fs_info, REQ_WRITE, logical, &length, &bbio, 0); + ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, &length, + &bbio, 0); if (ret || !bbio || length < blocksize) goto error; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 30cbf6921c0b..30ba99724896 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1193,8 +1193,8 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx, * with a length of PAGE_SIZE, each returned stripe * represents one mirror */ - ret = btrfs_map_block(fs_info, WRITE, logical, &mapped_length, - &bbio, 0); + ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, + &mapped_length, &bbio, 0); if (ret || !bbio || mapped_length < sublen) { kfree(bbio); return -EIO; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a4e0963bf457..de0c05cca390 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4103,7 +4103,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_nr_end - stripe_nr_orig); stripe_index = do_div(stripe_nr, map->num_stripes); } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { - if (rw & (REQ_WRITE | REQ_DISCARD)) + if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; @@ -4115,7 +4115,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (rw & (REQ_WRITE | REQ_DISCARD)) { + if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) { num_stripes = map->num_stripes; } else if (mirror_num) { stripe_index = mirror_num - 1; @@ -4129,7 +4129,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_index = do_div(stripe_nr, factor); stripe_index *= map->sub_stripes; - if (rw & REQ_WRITE) + if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) num_stripes = map->sub_stripes; else if (rw & REQ_DISCARD) num_stripes = min_t(u64, map->sub_stripes * @@ -4242,7 +4242,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } - if (rw & REQ_WRITE) { + if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_DUP)) { -- cgit v1.2.1 From 472262f35a6b3407e761b700d74c53530e5f144d Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 6 Nov 2012 14:43:46 +0100 Subject: Btrfs: changes to live filesystem are also written to replacement disk During a running dev replace operation, all write requests to the live filesystem are duplicated to also write to the target drive. 
Therefore btrfs_map_block() is changed to duplicate stripes that are written to the source disk of a device replace procedure to be written to the target disk as well. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index de0c05cca390..4d3bf187ab52 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4044,6 +4044,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, int num_stripes; int max_errors = 0; struct btrfs_bio *bbio = NULL; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + int dev_replace_is_ongoing = 0; + int num_alloc_stripes; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, *length); @@ -4089,6 +4092,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (!bbio_ret) goto out; + btrfs_dev_replace_lock(dev_replace); + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); + if (!dev_replace_is_ongoing) + btrfs_dev_replace_unlock(dev_replace); + num_stripes = 1; stripe_index = 0; stripe_nr_orig = stripe_nr; @@ -4155,7 +4163,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } BUG_ON(stripe_index >= map->num_stripes); - bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS); + num_alloc_stripes = num_stripes; + if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD))) + num_alloc_stripes <<= 1; + bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS); if (!bbio) { ret = -ENOMEM; goto out; @@ -4250,11 +4261,48 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } + if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && + dev_replace->tgtdev != NULL) { + int index_where_to_add; + u64 srcdev_devid = dev_replace->srcdev->devid; + + /* + * duplicate the write operations while the dev replace + * procedure is running. Since the copying of the old disk + * to the new disk takes place at run time while the + * filesystem is mounted writable, the regular write + * operations to the old disk have to be duplicated to go + * to the new disk as well. + * Note that device->missing is handled by the caller, and + * that the write to the old disk is already set up in the + * stripes array. + */ + index_where_to_add = num_stripes; + for (i = 0; i < num_stripes; i++) { + if (bbio->stripes[i].dev->devid == srcdev_devid) { + /* write to new disk, too */ + struct btrfs_bio_stripe *new = + bbio->stripes + index_where_to_add; + struct btrfs_bio_stripe *old = + bbio->stripes + i; + + new->physical = old->physical; + new->length = old->length; + new->dev = dev_replace->tgtdev; + index_where_to_add++; + max_errors++; + } + } + num_stripes = index_where_to_add; + } + *bbio_ret = bbio; bbio->num_stripes = num_stripes; bbio->max_errors = max_errors; bbio->mirror_num = mirror_num; out: + if (dev_replace_is_ongoing) + btrfs_dev_replace_unlock(dev_replace); free_extent_map(em); return ret; } -- cgit v1.2.1 From 30d9861ff9520e2a112eae71029bc9f7e915a441 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 6 Nov 2012 14:52:18 +0100 Subject: Btrfs: optionally avoid reads from device replace source drive It is desirable to be able to configure the device replace procedure to avoid reading the source drive (the one to be copied) whenever possible. 
This is useful when the number of read errors on this disk is high, because it would delay the copy procedure a lot. Therefore there is an option to avoid reading from the source disk unless the repair procedure really needs to access it. The regular read request asks for mapping the block with mirror_num == 0; in this case the source disk is avoided whenever possible. The repair code selects the mirror_num explicitly (mirror_num != 0); this case is not changed by this commit. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 46 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4d3bf187ab52..e2e01a327108 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4007,16 +4007,37 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) return ret; } -static int find_live_mirror(struct map_lookup *map, int first, int num, - int optimal) +static int find_live_mirror(struct btrfs_fs_info *fs_info, + struct map_lookup *map, int first, int num, + int optimal, int dev_replace_is_ongoing) { int i; - if (map->stripes[optimal].dev->bdev) - return optimal; - for (i = first; i < first + num; i++) { - if (map->stripes[i].dev->bdev) - return i; + int tolerance; + struct btrfs_device *srcdev; + + if (dev_replace_is_ongoing && + fs_info->dev_replace.cont_reading_from_srcdev_mode == + BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) + srcdev = fs_info->dev_replace.srcdev; + else + srcdev = NULL; + + /* + * try to avoid the drive that is the source drive for a + * dev-replace procedure, only choose it if no other non-missing + * mirror is available + */ + for (tolerance = 0; tolerance < 2; tolerance++) { + if (map->stripes[optimal].dev->bdev && + (tolerance || map->stripes[optimal].dev != srcdev)) + return optimal; + for (i = first; i < first + num; i++) { + if (map->stripes[i].dev->bdev && + (tolerance || map->stripes[i].dev != srcdev)) + return i; + } } + /* we couldn't find one that doesn't fail. Just return something * and the io error handling code will clean up eventually */ @@ -4116,9 +4137,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, else if (mirror_num) stripe_index = mirror_num - 1; else { - stripe_index = find_live_mirror(map, 0, + stripe_index = find_live_mirror(fs_info, map, 0, map->num_stripes, - current->pid % map->num_stripes); + current->pid % map->num_stripes, + dev_replace_is_ongoing); mirror_num = stripe_index + 1; } @@ -4147,9 +4169,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_index += mirror_num - 1; else { int old_stripe_index = stripe_index; - stripe_index = find_live_mirror(map, stripe_index, + stripe_index = find_live_mirror(fs_info, map, + stripe_index, map->sub_stripes, stripe_index + - current->pid % map->sub_stripes); + current->pid % map->sub_stripes, + dev_replace_is_ongoing); mirror_num = stripe_index - old_stripe_index + 1; } } else { -- cgit v1.2.1 From 72d7aefccd512b66cd5543e652eae04be12085fc Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 6 Nov 2012 14:57:46 +0100 Subject: Btrfs: increase BTRFS_MAX_MIRRORS by one for dev replace This change of the define is effective in all modes; it is required and used only in the case when a device replace procedure is running.
The reason is that during an active device replace procedure, the target device of the copy operation is also a mirror for the filesystem data and can be used to read data in order to repair read errors on other disks. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 46bd7d5f504b..91ff078e85df 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -48,7 +48,7 @@ struct btrfs_ordered_sum; #define BTRFS_MAGIC "_BHRfS_M" -#define BTRFS_MAX_MIRRORS 2 +#define BTRFS_MAX_MIRRORS 3 #define BTRFS_MAX_LEVEL 8 -- cgit v1.2.1 From ad6d620e2a5704f6bf3a39c92a75aad962c51cb3 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 6 Nov 2012 15:06:47 +0100 Subject: Btrfs: allow repair code to include target disk when searching mirrors Make the target disk of a running device replace operation available for reading. This is only used as a last resort for the defect repair procedure. Whether it can be used depends on the location of the data block to read, because during an ongoing device replace operation, the target drive is only partially filled with the filesystem data. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 159 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 154 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e2e01a327108..32a4948b621c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4004,6 +4004,12 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) else ret = 1; free_extent_map(em); + + btrfs_dev_replace_lock(&fs_info->dev_replace); + if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) + ret++; + btrfs_dev_replace_unlock(&fs_info->dev_replace); + return ret; } @@ -4068,6 +4074,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; int dev_replace_is_ongoing = 0; int num_alloc_stripes; + int patch_the_first_stripe_for_dev_replace = 0; + u64 physical_to_patch_in_first_stripe = 0; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, *length); @@ -4084,9 +4092,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, map = (struct map_lookup *)em->bdev; offset = logical - em->start; - if (mirror_num > map->num_stripes) - mirror_num = 0; - stripe_nr = offset; /* * stripe_nr counts the total number of stripes we have to stride @@ -4118,6 +4123,88 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (!dev_replace_is_ongoing) btrfs_dev_replace_unlock(dev_replace); + if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && + !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) && + dev_replace->tgtdev != NULL) { + /* + * in dev-replace case, for repair case (that's the only + * case where the mirror is selected explicitly when + * calling btrfs_map_block), blocks left of the left cursor + * can also be read from the target drive. + * For REQ_GET_READ_MIRRORS, the target drive is added as + * the last one to the array of stripes. For READ, it also + * needs to be supported using the same mirror number. + * If the requested block is not left of the left cursor, + * EIO is returned. This can happen because btrfs_num_copies() + * returns one more in the dev-replace case.
+ */ + u64 tmp_length = *length; + struct btrfs_bio *tmp_bbio = NULL; + int tmp_num_stripes; + u64 srcdev_devid = dev_replace->srcdev->devid; + int index_srcdev = 0; + int found = 0; + u64 physical_of_found = 0; + + ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, + logical, &tmp_length, &tmp_bbio, 0); + if (ret) { + WARN_ON(tmp_bbio != NULL); + goto out; + } + + tmp_num_stripes = tmp_bbio->num_stripes; + if (mirror_num > tmp_num_stripes) { + /* + * REQ_GET_READ_MIRRORS does not contain this + * mirror, that means that the requested area + * is not left of the left cursor + */ + ret = -EIO; + kfree(tmp_bbio); + goto out; + } + + /* + * process the rest of the function using the mirror_num + * of the source drive. Therefore look it up first. + * At the end, patch the device pointer to the one of the + * target drive. + */ + for (i = 0; i < tmp_num_stripes; i++) { + if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) { + /* + * In case of DUP, in order to keep it + * simple, only add the mirror with the + * lowest physical address + */ + if (found && + physical_of_found <= + tmp_bbio->stripes[i].physical) + continue; + index_srcdev = i; + found = 1; + physical_of_found = + tmp_bbio->stripes[i].physical; + } + } + + if (found) { + mirror_num = index_srcdev + 1; + patch_the_first_stripe_for_dev_replace = 1; + physical_to_patch_in_first_stripe = physical_of_found; + } else { + WARN_ON(1); + ret = -EIO; + kfree(tmp_bbio); + goto out; + } + + kfree(tmp_bbio); + } else if (mirror_num > map->num_stripes) { + mirror_num = 0; + } + num_stripes = 1; stripe_index = 0; stripe_nr_orig = stripe_nr; @@ -4188,8 +4275,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, BUG_ON(stripe_index >= map->num_stripes); num_alloc_stripes = num_stripes; - if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD))) - num_alloc_stripes <<= 1; + if (dev_replace_is_ongoing) { + if (rw & (REQ_WRITE | REQ_DISCARD)) + num_alloc_stripes <<= 1; + if (rw & REQ_GET_READ_MIRRORS) + num_alloc_stripes++; + } bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS); if (!bbio) { ret = -ENOMEM; @@ -4318,12 +4409,70 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } num_stripes = index_where_to_add; + } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) && + dev_replace->tgtdev != NULL) { + u64 srcdev_devid = dev_replace->srcdev->devid; + int index_srcdev = 0; + int found = 0; + u64 physical_of_found = 0; + + /* + * During the dev-replace procedure, the target drive can + * also be used to read data in case it is needed to repair + * a corrupt block elsewhere. This is possible if the + * requested area is left of the left cursor. In this area, + * the target drive is a full copy of the source drive. 
+ */ + for (i = 0; i < num_stripes; i++) { + if (bbio->stripes[i].dev->devid == srcdev_devid) { + /* + * In case of DUP, in order to keep it + * simple, only add the mirror with the + * lowest physical address + */ + if (found && + physical_of_found <= + bbio->stripes[i].physical) + continue; + index_srcdev = i; + found = 1; + physical_of_found = bbio->stripes[i].physical; + } + } + if (found) { + u64 length = map->stripe_len; + + if (physical_of_found + length <= + dev_replace->cursor_left) { + struct btrfs_bio_stripe *tgtdev_stripe = + bbio->stripes + num_stripes; + + tgtdev_stripe->physical = physical_of_found; + tgtdev_stripe->length = + bbio->stripes[index_srcdev].length; + tgtdev_stripe->dev = dev_replace->tgtdev; + + num_stripes++; + } + } } *bbio_ret = bbio; bbio->num_stripes = num_stripes; bbio->max_errors = max_errors; bbio->mirror_num = mirror_num; + + /* + * this is the case that REQ_READ && dev_replace_is_ongoing && + * mirror_num == num_stripes + 1 && dev_replace target drive is + * available as a mirror + */ + if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { + WARN_ON(num_stripes > 1); + bbio->stripes[0].dev = dev_replace->tgtdev; + bbio->stripes[0].physical = physical_to_patch_in_first_stripe; + bbio->mirror_num = map->num_stripes + 1; + } out: + if (dev_replace_is_ongoing) + btrfs_dev_replace_unlock(dev_replace); free_extent_map(em); return ret; } -- cgit v1.2.1 From e180377f1ae48b3cbc559c9875d9b038f7f000c6 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 12 Dec 2012 13:50:59 -0800 Subject: thp: change split_huge_page_pmd() interface Pass vma instead of mm and add address parameter. In most cases we already have the vma on the stack. We provide split_huge_page_pmd_mm() for the few cases when we have the mm but not the vma. This change is preparation for the huge zero pmd splitting implementation. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Andi Kleen Cc: "H. Peter Anvin" Cc: Mel Gorman Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 90c63f9392a5..291a0d15a0be 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -643,7 +643,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, spinlock_t *ptl; struct page *page; - split_huge_page_pmd(walk->mm, pmd); + split_huge_page_pmd(vma, addr, pmd); if (pmd_trans_unstable(pmd)) return 0; -- cgit v1.2.1 From 4ff1b2c29326a2a3e130b46f69b7ab0e853d09d8 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 12 Dec 2012 13:51:25 -0800 Subject: procfs: use N_MEMORY instead of N_HIGH_MEMORY N_HIGH_MEMORY stands for the nodes that have normal or high memory. N_MEMORY stands for the nodes that have any memory. The code here needs to handle the nodes which have memory, so we should use N_MEMORY instead.
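A condensed sketch of the idiom after the change, modelled on the fs/proc/kcore.c hunk below, which computes the maximum PFN by walking exactly the nodes that have memory:

	unsigned long end_pfn = 0;
	int nid;

	for_each_node_state(nid, N_MEMORY) {
		/* nid has some memory, not necessarily high memory */
		unsigned long node_end = NODE_DATA(nid)->node_start_pfn +
					 NODE_DATA(nid)->node_spanned_pages;
		if (node_end > end_pfn)
			end_pfn = node_end;
	}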
Signed-off-by: Lai Jiangshan Acked-by: Hillf Danton Signed-off-by: Wen Congyang Cc: Christoph Lameter Cc: Lin Feng Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/kcore.c | 2 +- fs/proc/task_mmu.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 86c67eee439f..e96d4f18ca3a 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -249,7 +249,7 @@ static int kcore_update_ram(void) /* Not inialized....update now */ /* find out "max pfn" */ end_pfn = 0; - for_each_node_state(nid, N_HIGH_MEMORY) { + for_each_node_state(nid, N_MEMORY) { unsigned long node_end; node_end = NODE_DATA(nid)->node_start_pfn + NODE_DATA(nid)->node_spanned_pages; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 291a0d15a0be..48775628abbf 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1126,7 +1126,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, return NULL; nid = page_to_nid(page); - if (!node_isset(nid, node_states[N_HIGH_MEMORY])) + if (!node_isset(nid, node_states[N_MEMORY])) return NULL; return page; @@ -1279,7 +1279,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) if (md->writeback) seq_printf(m, " writeback=%lu", md->writeback); - for_each_node_state(n, N_HIGH_MEMORY) + for_each_node_state(n, N_MEMORY) if (md->node[n]) seq_printf(m, " N%d=%lu", n, md->node[n]); out: -- cgit v1.2.1 From 5aaea51dfbddcccaf38eacd03379f47c99bbe944 Mon Sep 17 00:00:00 2001 From: Yan Hong Date: Wed, 12 Dec 2012 13:52:14 -0800 Subject: writeback: fix a typo in comment Signed-off-by: Yan Hong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3e3422f7f0a4..310972b72a66 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1034,7 +1034,7 @@ int bdi_writeback_thread(void *data) while (!kthread_freezable_should_stop(NULL)) { /* * Remove own delayed wake-up timer, since we are already awake - * and we'll take care of the preriodic write-back. + * and we'll take care of the periodic write-back. */ del_timer(&wb->wakeup_timer); -- cgit v1.2.1 From a3f3c29cb290a2d5d26e3cf5504f447fd7256a81 Mon Sep 17 00:00:00 2001 From: Yan Hong Date: Wed, 12 Dec 2012 13:52:15 -0800 Subject: fs/buffer.c: do not inline exported function It makes no sense to inline an exported function. Signed-off-by: Yan Hong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 6e9ed48064fc..9083e528e3c9 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -46,8 +46,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) -inline void -init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) +void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) { bh->b_end_io = handler; bh->b_private = private; -- cgit v1.2.1 From 02c0ab684fc41bc13ba8d5ad89b0dc73b092fa08 Mon Sep 17 00:00:00 2001 From: Yan Hong Date: Wed, 12 Dec 2012 13:52:16 -0800 Subject: fs/buffer.c: remove redundant initialization in alloc_page_buffers() buffer_head comes from kmem_cache_zalloc(), no need to zero its fields. 
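For context, the allocation side looks roughly like this (a sketch only; bh_cachep is the buffer_head slab cache in fs/buffer.c, and the real alloc_buffer_head() takes the gfp flags as a parameter), which is why the fields arrive already zeroed:

	struct buffer_head *bh;

	/* kmem_cache_zalloc() returns zero-filled memory, so b_bdev,
	 * b_state and b_count already start out as NULL/0 here */
	bh = kmem_cache_zalloc(bh_cachep, GFP_NOFS);
	if (!bh)
		return NULL;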
Signed-off-by: Yan Hong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 9083e528e3c9..c017a2dfb909 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -849,13 +849,10 @@ try_again: if (!bh) goto no_grow; - bh->b_bdev = NULL; bh->b_this_page = head; bh->b_blocknr = -1; head = bh; - bh->b_state = 0; - atomic_set(&bh->b_count, 0); bh->b_size = size; /* Link the buffer to its page */ -- cgit v1.2.1 From c628937803c652132d21f383736375e2feee4bfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arve=20Hj=C3=B8nnev=C3=A5g?= Date: Tue, 11 Dec 2012 17:49:24 -0800 Subject: pstore/ram: Fix bounds checks for mem_size, record_size, console_size and ftrace_size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bounds check in ramoops_init_prz was incorrect and ramoops_init_przs had no check. Additionally, ramoops_init_przs allows record_size to be 0, but ramoops_pstore_write_buf would always crash in this case. Signed-off-by: Arve Hjønnevåg Signed-off-by: Anton Vorontsov --- fs/pstore/ram.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 8741cea6253c..dba70e53b72c 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -189,7 +189,7 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, struct pstore_info *psi) { struct ramoops_context *cxt = psi->data; - struct persistent_ram_zone *prz = cxt->przs[cxt->dump_write_cnt]; + struct persistent_ram_zone *prz; size_t hlen; if (type == PSTORE_TYPE_CONSOLE) { @@ -226,6 +226,11 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, if (part != 1) return -ENOSPC; + if (!cxt->przs) + return -ENOSPC; + + prz = cxt->przs[cxt->dump_write_cnt]; + hlen = ramoops_write_kmsg_hdr(prz); if (size + hlen > prz->buffer_size) size = prz->buffer_size - hlen; @@ -297,6 +302,11 @@ static int __devinit ramoops_init_przs(struct device *dev, if (!cxt->record_size) return 0; + if (*paddr + dump_mem_sz - cxt->phys_addr > cxt->size) { + dev_err(dev, "no room for dumps\n"); + return -ENOMEM; + } + cxt->max_dump_cnt = dump_mem_sz / cxt->record_size; if (!cxt->max_dump_cnt) return -ENOMEM; @@ -335,8 +345,12 @@ static int __devinit ramoops_init_prz(struct device *dev, if (!sz) return 0; - if (*paddr + sz > *paddr + cxt->size) + if (*paddr + sz - cxt->phys_addr > cxt->size) { + dev_err(dev, "no room for mem region (0x%zx@0x%llx) in (0x%lx@0x%llx)\n", + sz, (unsigned long long)*paddr, + cxt->size, (unsigned long long)cxt->phys_addr); return -ENOMEM; + } *prz = persistent_ram_new(*paddr, sz, sig, cxt->ecc_size); if (IS_ERR(*prz)) { -- cgit v1.2.1 From ebacfd1ece3bfa46296fc92c6f996cb5f7fc75e6 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Wed, 14 Nov 2012 18:48:15 -0800 Subject: pstore/ftrace: Adjust for ftrace_ops->func prototype change MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit fixes the following warning: fs/pstore/ftrace.c:51:2: warning: initialization from incompatible pointer type [enabled by default] fs/pstore/ftrace.c:51:2: warning: (near initialization for ‘pstore_ftrace_ops.func’) [enabled by default] Signed-off-by: Anton Vorontsov --- fs/pstore/ftrace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index 2d57e1ac0115..43b12807a51d 100644 ---
a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -28,7 +28,9 @@ #include "internal.h" static void notrace pstore_ftrace_call(unsigned long ip, - unsigned long parent_ip) + unsigned long parent_ip, + struct ftrace_ops *op, + struct pt_regs *regs) { unsigned long flags; struct pstore_ftrace_record rec = {}; -- cgit v1.2.1 From f259613a1e4b44a0cf85a5dafd931be96ee7c9e5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Dec 2012 15:14:36 +1100 Subject: NFS: avoid NULL dereference in nfs_destroy_server In rare circumstances, nfs_clone_server() of a v2 or v3 server can get an error between setting server->destroy (to nfs_destroy_server) and calling nfs_start_lockd (which will set server->nlm_host). If this happens, nfs_clone_server will call nfs_free_server which will call nfs_destroy_server and thence nlmclnt_done(NULL). This causes the NULL to be dereferenced. So add a guard to only call nlmclnt_done() if ->nlm_host is not NULL. The other guards there are irrelevant as nlm_host can only be non-NULL if one of these flags is set - so remove those tests. (Thanks to Trond for this suggestion). This is suitable for any stable kernel since 2.6.25. Cc: stable@vger.kernel.org Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index c285e0a117e4..9f3c66438d0e 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -615,8 +615,7 @@ EXPORT_SYMBOL_GPL(nfs_create_rpc_client); */ static void nfs_destroy_server(struct nfs_server *server) { - if (!(server->flags & NFS_MOUNT_LOCAL_FLOCK) || - !(server->flags & NFS_MOUNT_LOCAL_FCNTL)) + if (server->nlm_host) nlmclnt_done(server->nlm_host); } -- cgit v1.2.1 From cfc84c9f73ab8a6933bd4f36efac1196cddad581 Mon Sep 17 00:00:00 2001 From: Cyril Roelandt Date: Tue, 20 Nov 2012 10:23:07 -0600 Subject: ceph: fix dentry reference leak in ceph_encode_fh(). dput() was not called in the error path. Signed-off-by: Cyril Roelandt Reviewed-by: Alex Elder --- fs/ceph/export.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 862887004d20..f350be34601f 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -56,13 +56,15 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, struct ceph_nfs_confh *cfh = (void *)rawfh; int connected_handle_length = sizeof(*cfh)/4; int handle_length = sizeof(*fh)/4; - struct dentry *dentry = d_find_alias(inode); + struct dentry *dentry; struct dentry *parent; /* don't re-export snaps */ if (ceph_snap(inode) != CEPH_NOSNAP) return -EINVAL; + dentry = d_find_alias(inode); + /* if we found an alias, generate a connectable fh */ if (*max_len >= connected_handle_length && dentry) { dout("encode_fh %p connectable\n", dentry); -- cgit v1.2.1 From 83aff95eb9d60aff5497e9f44a2ae906b86d8e88 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 28 Nov 2012 12:28:24 -0800 Subject: libceph: remove 'osdtimeout' option This would reset a connection with any OSD that had an outstanding request that was taking more than N seconds. The idea was that if the OSD was buggy, the client could compensate by resending the request. In reality, this only served to hide server bugs, and we haven't actually seen such a bug in quite a while. Moreover, the userspace client code never did this.
More importantly, often the request is taking a long time because the OSD is trying to recover, or overloaded, and killing the connection and retrying would only make the situation worse by giving the OSD more work to do. Signed-off-by: Sage Weil Reviewed-by: Alex Elder --- fs/ceph/super.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 2f586b0e5e0f..fcda1c73a1e5 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -403,8 +403,6 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",mount_timeout=%d", opt->mount_timeout); if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl); - if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT) - seq_printf(m, ",osdtimeout=%d", opt->osd_timeout); if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) seq_printf(m, ",osdkeepalivetimeout=%d", opt->osd_keepalive_timeout); -- cgit v1.2.1 From d2cc4dde9206aa2c7fb237aa689d3277cc070547 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 29 Nov 2012 08:37:03 -0600 Subject: bdi_register: add __printf verification, fix arg mismatch __printf is useful to verify format and arguments. Signed-off-by: Joe Perches Reviewed-by: Alex Elder --- fs/ceph/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index fcda1c73a1e5..1a144001b2e1 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -842,7 +842,7 @@ static int ceph_register_bdi(struct super_block *sb, fsc->backing_dev_info.ra_pages = default_backing_dev_info.ra_pages; - err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d", + err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld", atomic_long_inc_return(&bdi_seq)); if (!err) sb->s_bdi = &fsc->backing_dev_info; -- cgit v1.2.1 From 5e62ad30157d0da04cf40c6d1a2f4bc840948b9c Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 19 Nov 2012 10:49:04 +0800 Subject: ceph: Don't update i_max_size when handling non-auth cap The cap from non-auth mds doesn't have a meaningful max_size value. Signed-off-by: Yan, Zheng Signed-off-by: Sage Weil --- fs/ceph/caps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 2d0141e95c88..8072aefc427c 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2390,7 +2390,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, &atime); /* max size increase? */ - if (max_size != ci->i_max_size) { + if (ci->i_auth_cap == cap && max_size != ci->i_max_size) { dout("max_size %lld -> %llu\n", ci->i_max_size, max_size); ci->i_max_size = max_size; if (max_size >= ci->i_wanted_max_size) { -- cgit v1.2.1 From ed75ec2cd19b47efcd292b6e23f58e56f4c5bc34 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 19 Nov 2012 10:49:06 +0800 Subject: ceph: Fix infinite loop in __wake_requests __wake_requests() will enter infinite loop if we use it to wake requests in the session->s_waiting list. __wake_requests() deletes requests from the list and __do_request() adds requests back to the list. 
Signed-off-by: Yan, Zheng Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 62d2342eb267..9165eb8309eb 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1876,9 +1876,14 @@ finish: static void __wake_requests(struct ceph_mds_client *mdsc, struct list_head *head) { - struct ceph_mds_request *req, *nreq; + struct ceph_mds_request *req; + LIST_HEAD(tmp_list); + + list_splice_init(head, &tmp_list); - list_for_each_entry_safe(req, nreq, head, r_wait) { + while (!list_empty(&tmp_list)) { + req = list_entry(tmp_list.next, + struct ceph_mds_request, r_wait); list_del_init(&req->r_wait); __do_request(mdsc, req); } -- cgit v1.2.1 From 0685235ffd9dbdb9ccbda587f8a3c83ad1d5a921 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 19 Nov 2012 10:49:07 +0800 Subject: ceph: Don't add dirty inode to dirty list if caps is in migration Add dirty inode to cap_dirty_migrating list instead, this can avoid ceph_flush_dirty_caps() entering infinite loop. Signed-off-by: Yan, Zheng Signed-off-by: Sage Weil --- fs/ceph/caps.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 8072aefc427c..5efa3f5e2f77 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1351,11 +1351,15 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) if (!ci->i_head_snapc) ci->i_head_snapc = ceph_get_snap_context( ci->i_snap_realm->cached_context); - dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode, - ci->i_head_snapc); + dout(" inode %p now dirty snapc %p auth cap %p\n", + &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); BUG_ON(!list_empty(&ci->i_dirty_item)); spin_lock(&mdsc->cap_dirty_lock); - list_add(&ci->i_dirty_item, &mdsc->cap_dirty); + if (ci->i_auth_cap) + list_add(&ci->i_dirty_item, &mdsc->cap_dirty); + else + list_add(&ci->i_dirty_item, + &mdsc->cap_dirty_migrating); spin_unlock(&mdsc->cap_dirty_lock); if (ci->i_flushing_caps == 0) { ihold(inode); -- cgit v1.2.1 From a85f50b6ef93fbbb2ae932ce9b2376509d172796 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 19 Nov 2012 10:49:08 +0800 Subject: ceph: Fix __ceph_do_pending_vmtruncate we should set i_truncate_pending to 0 after page cache is truncated to i_truncate_size Signed-off-by: Yan, Zheng Signed-off-by: Sage Weil --- fs/ceph/inode.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 4b5762ef7c2b..81613bced19f 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1466,7 +1466,7 @@ void __ceph_do_pending_vmtruncate(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); u64 to; - int wrbuffer_refs, wake = 0; + int wrbuffer_refs, finish = 0; retry: spin_lock(&ci->i_ceph_lock); @@ -1498,15 +1498,18 @@ retry: truncate_inode_pages(inode->i_mapping, to); spin_lock(&ci->i_ceph_lock); - ci->i_truncate_pending--; - if (ci->i_truncate_pending == 0) - wake = 1; + if (to == ci->i_truncate_size) { + ci->i_truncate_pending = 0; + finish = 1; + } spin_unlock(&ci->i_ceph_lock); + if (!finish) + goto retry; if (wrbuffer_refs == 0) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); - if (wake) - wake_up_all(&ci->i_cap_wq); + + wake_up_all(&ci->i_cap_wq); } -- cgit v1.2.1 From 0e5e1774a92e6fe9c511585de8f078b4c4c68dbb Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 19 Nov 2012 10:49:09 +0800 Subject: ceph: call handle_cap_grant() 
for cap import message If client sends cap message that requests new max size during exporting caps, the exporting MDS will drop the message quietly. So the client may wait for the reply that updates the max size forever. call handle_cap_grant() for cap import message can avoid this issue. Signed-off-by: Yan, Zheng Signed-off-by: Sage Weil --- fs/ceph/caps.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 5efa3f5e2f77..a1d9bb30c1bf 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2751,6 +2751,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, /* make sure we re-request max_size, if necessary */ spin_lock(&ci->i_ceph_lock); + ci->i_wanted_max_size = 0; /* reset */ ci->i_requested_max_size = 0; spin_unlock(&ci->i_ceph_lock); } @@ -2846,8 +2847,6 @@ void ceph_handle_caps(struct ceph_mds_session *session, case CEPH_CAP_OP_IMPORT: handle_cap_import(mdsc, inode, h, session, snaptrace, snaptrace_len); - ceph_check_caps(ceph_inode(inode), 0, session); - goto done_unlocked; } /* the rest require a cap */ @@ -2864,6 +2863,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, switch (op) { case CEPH_CAP_OP_REVOKE: case CEPH_CAP_OP_GRANT: + case CEPH_CAP_OP_IMPORT: handle_cap_grant(inode, h, session, cap, msg->middle); goto done_unlocked; -- cgit v1.2.1 From 8884d53dd63b1d9315b343564fcbe1ede004a99e Mon Sep 17 00:00:00 2001 From: David Zafman Date: Mon, 3 Dec 2012 19:14:05 -0800 Subject: libceph: Unlock unprocessed pages in start_read() error path Function start_read() can get an error before processing all pages. It must not only release the remaining pages, but unlock them too. This fixes http://tracker.newdream.net/issues/3370 Signed-off-by: David Zafman Reviewed-by: Alex Elder --- fs/ceph/addr.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 21a07187df05..8e8a818cba07 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -267,6 +267,14 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) kfree(req->r_pages); } +static void ceph_unlock_page_vector(struct page **pages, int num_pages) +{ + int i; + + for (i = 0; i < num_pages; i++) + unlock_page(pages[i]); +} + /* * start an async read(ahead) operation. return nr_pages we submitted * a read for on success, or negative error code. @@ -347,6 +355,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) return nr_pages; out_pages: + ceph_unlock_page_vector(pages, nr_pages); ceph_release_page_vector(pages, nr_pages); out: ceph_osdc_put_request(req); -- cgit v1.2.1 From 2fb7d99d0de3fd8ae869f35ab682581d8455887a Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 10 Oct 2012 00:08:56 +0900 Subject: udf: fix memory leak while allocating blocks during write Need to brelse the buffer_head stored in cur_epos and next_epos. 
Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Signed-off-by: Jan Kara --- fs/udf/inode.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/udf/inode.c b/fs/udf/inode.c index df88b957ccf0..2b7759371ff6 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -765,6 +765,8 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, goal, err); if (!newblocknum) { brelse(prev_epos.bh); + brelse(cur_epos.bh); + brelse(next_epos.bh); *err = -ENOSPC; return 0; } @@ -795,6 +797,8 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, udf_update_extents(inode, laarr, startnum, endnum, &prev_epos); brelse(prev_epos.bh); + brelse(cur_epos.bh); + brelse(next_epos.bh); newblock = udf_get_pblock(inode->i_sb, newblocknum, iinfo->i_location.partitionReferenceNum, 0); -- cgit v1.2.1 From fb719c59bdb4fca86ee1fd1f42ab3735ca12b6b2 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 10 Oct 2012 00:09:12 +0900 Subject: udf: don't increment lenExtents while writing to a hole Incrementing lenExtents even while writing to a hole is bad for performance as calls to udf_discard_prealloc and udf_truncate_tail_extent would not return from start if isize != lenExtents Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Signed-off-by: Jan Kara --- fs/udf/inode.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 2b7759371ff6..8266f2ed7fc4 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -601,6 +601,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, struct udf_inode_info *iinfo = UDF_I(inode); int goal = 0, pgoal = iinfo->i_location.logicalBlockNum; int lastblock = 0; + bool isBeyondEOF; *err = 0; *new = 0; @@ -680,7 +681,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, /* Are we beyond EOF? */ if (etype == -1) { int ret; - + isBeyondEOF = 1; if (count) { if (c) laarr[0] = laarr[1]; @@ -723,6 +724,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, endnum = c + 1; lastblock = 1; } else { + isBeyondEOF = 0; endnum = startnum = ((count > 2) ? 2 : count); /* if the current extent is in position 0, @@ -770,7 +772,8 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, *err = -ENOSPC; return 0; } - iinfo->i_lenExtents += inode->i_sb->s_blocksize; + if (isBeyondEOF) + iinfo->i_lenExtents += inode->i_sb->s_blocksize; } /* if the extent the requsted block is located in contains multiple -- cgit v1.2.1 From 6d31d15f21b376ac0d8a2323fd6673683bc82bd6 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 10 Oct 2012 00:09:44 +0900 Subject: udf: remove un-needed variable from inode_getblk The variable last_block is not needed. Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Signed-off-by: Jan Kara --- fs/udf/inode.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 8266f2ed7fc4..cbae1ed0b7c1 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -587,7 +587,6 @@ out: static sector_t inode_getblk(struct inode *inode, sector_t block, int *err, int *new) { - static sector_t last_block; struct kernel_long_ad laarr[EXTENT_MERGE_SIZE]; struct extent_position prev_epos, cur_epos, next_epos; int count = 0, startnum = 0, endnum = 0; @@ -677,7 +676,6 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, return newblock; } - last_block = block; /* Are we beyond EOF? 
*/ if (etype == -1) { int ret; @@ -719,7 +717,6 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, memset(&laarr[c].extLocation, 0x00, sizeof(struct kernel_lb_addr)); count++; - endnum++; } endnum = c + 1; lastblock = 1; -- cgit v1.2.1 From 195c0f96f0f96da317e22c9851a7ecc1a541f9ad Mon Sep 17 00:00:00 2001 From: Zhao Hongjiang Date: Mon, 29 Oct 2012 15:28:07 +0800 Subject: ext3: get rid of the duplicate code on ext3_fill_super Setting s_mount_opt to 0 is unnecessary because we use kzalloc() for sb allocation. s_resuid and s_resgid are set again few lines below based on values in on disk superblock. Signed-off-by: Zhao Hongjiang Signed-off-by: Jan Kara --- fs/ext3/super.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 5366393528df..6e50223b3299 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1661,9 +1661,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) return -ENOMEM; } sb->s_fs_info = sbi; - sbi->s_mount_opt = 0; - sbi->s_resuid = make_kuid(&init_user_ns, EXT3_DEF_RESUID); - sbi->s_resgid = make_kgid(&init_user_ns, EXT3_DEF_RESGID); sbi->s_sb_block = sb_block; blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE); -- cgit v1.2.1 From 4789775477718983b6f8c604a809b950c6a10052 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 3 Nov 2012 21:30:21 +0100 Subject: ext3: drop if around WARN_ON Just use WARN_ON rather than an if containing only WARN_ON(1). A simplified version of the semantic patch that makes this transformation is as follows: (http://coccinelle.lip6.fr/) // @@ expression e; @@ - if (e) WARN_ON(1); + WARN_ON(e); // Signed-off-by: Julia Lawall Signed-off-by: Jan Kara --- fs/ext3/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 7e87e37a372a..b176d4253544 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1071,8 +1071,7 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode, * mapped. 0 in case of a HOLE. */ if (err > 0) { - if (err > 1) - WARN_ON(1); + WARN_ON(err > 1); err = 0; } *errp = err; -- cgit v1.2.1 From 56df127855b593cc4b2e94ce8df5c609b0109b42 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Sat, 3 Nov 2012 23:02:28 +0100 Subject: quota: Use the pre-processor to compile out quotactl_cmd_write when !CONFIG_BLOCK MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit quotactl_cmd_write() is only ever invoked when BLOCK is configured. When !CONFIG_BLOCK, the build warning below is displayed. Let's fix that. 
fs/quota/quota.c:311:12: warning: ‘quotactl_cmd_write’ defined but not used [-Wunused-function] Cc: Jan Kara Signed-off-by: Lee Jones Signed-off-by: Jan Kara --- fs/quota/quota.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/quota/quota.c b/fs/quota/quota.c index af1661f7a54f..c7314f1771f5 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -307,6 +307,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, } } +#ifdef CONFIG_BLOCK + /* Return 1 if 'cmd' will block on frozen filesystem */ static int quotactl_cmd_write(int cmd) { @@ -322,6 +324,8 @@ static int quotactl_cmd_write(int cmd) return 1; } +#endif /* CONFIG_BLOCK */ + /* * look up a superblock on which quota ops will be performed * - use the name of a block device to find the superblock thereon -- cgit v1.2.1 From aaea7d2f78d008882524eddff0d78098c8fa9496 Mon Sep 17 00:00:00 2001 From: Yanchuan Nian Date: Thu, 13 Dec 2012 14:37:34 +0800 Subject: nfs: Remove duplicate function declaration in internal.h Remove duplicate function declaration in internal.h Signed-off-by: Yanchuan Nian [Trond: Added nfs_pageio_init_read, which suffered from the same problem] Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index fb994471bd32..89c1ee4a432c 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -363,9 +363,6 @@ extern int nfs_initiate_read(struct rpc_clnt *clnt, extern void nfs_read_prepare(struct rpc_task *task, void *calldata); extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr); -extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, - struct inode *inode, - const struct nfs_pgio_completion_ops *compl_ops); extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); extern void nfs_readdata_release(struct nfs_read_data *rdata); @@ -388,9 +385,6 @@ extern struct nfs_write_header *nfs_writehdr_alloc(void); extern void nfs_writehdr_free(struct nfs_pgio_header *hdr); extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr); -extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, - struct inode *inode, int ioflags, - const struct nfs_pgio_completion_ops *compl_ops); extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); extern void nfs_writedata_release(struct nfs_write_data *wdata); extern void nfs_commit_free(struct nfs_commit_data *p); -- cgit v1.2.1 From 48d7a57693af660666c4afdc54c09b2f9655e260 Mon Sep 17 00:00:00 2001 From: Yanchuan Nian Date: Thu, 13 Dec 2012 14:37:52 +0800 Subject: nfs: Remove unused list nfs4_clientid_list This list was designed to store struct nfs4_client in the client side. But nfs4_client was obsolete and has been removed from the source code. So remove the unused list. 
Signed-off-by: Yanchuan Nian Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 8dcbd9a0367d..9448c579d41a 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -67,7 +67,6 @@ const nfs4_stateid zero_stateid; static DEFINE_MUTEX(nfs_clid_init_mutex); -static LIST_HEAD(nfs4_clientid_list); int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) { -- cgit v1.2.1 From f55fb0c24386cee8b78f60d60186716bd0909493 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Fri, 14 Dec 2012 10:23:23 +0800 Subject: autofs4 - dont clear DCACHE_NEED_AUTOMOUNT on rootless mount The DCACHE_NEED_AUTOMOUNT flag is cleared on mount and set on expire for autofs rootless multi-mount dentrys to prevent unnecessary calls to ->d_automount(). Since DCACHE_MANAGE_TRANSIT is always set on autofs dentrys ->d_managed() is always called so the check can be done in ->d_manage() without the need to change the flag. This still avoids unnecessary calls to ->d_automount(), adds negligible overhead and eliminates a seriously ugly check in the expire code. Signed-off-by: Ian Kent Signed-off-by: Linus Torvalds --- fs/autofs4/expire.c | 9 -------- fs/autofs4/root.c | 61 +++++++++++++++++++++++++++++++---------------------- 2 files changed, 36 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 842d00048a65..01443ce43ee7 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -548,15 +548,6 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, spin_lock(&sbi->fs_lock); ino->flags &= ~AUTOFS_INF_EXPIRING; - spin_lock(&dentry->d_lock); - if (!ret) { - if ((IS_ROOT(dentry) || - (autofs_type_indirect(sbi->type) && - IS_ROOT(dentry->d_parent))) && - !(dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) - __managed_dentry_set_automount(dentry); - } - spin_unlock(&dentry->d_lock); complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); dput(dentry); diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 91b11650722e..30a6ab66e99a 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -355,7 +355,6 @@ static struct vfsmount *autofs4_d_automount(struct path *path) status = autofs4_mount_wait(dentry); if (status) return ERR_PTR(status); - spin_lock(&sbi->fs_lock); goto done; } @@ -364,8 +363,11 @@ static struct vfsmount *autofs4_d_automount(struct path *path) * having d_mountpoint() true, so there's no need to call back * to the daemon. */ - if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) + if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) { + spin_unlock(&sbi->fs_lock); goto done; + } + if (!d_mountpoint(dentry)) { /* * It's possible that user space hasn't removed directories @@ -379,8 +381,10 @@ static struct vfsmount *autofs4_d_automount(struct path *path) * require user space behave. */ if (sbi->version > 4) { - if (have_submounts(dentry)) + if (have_submounts(dentry)) { + spin_unlock(&sbi->fs_lock); goto done; + } } else { spin_lock(&dentry->d_lock); if (!list_empty(&dentry->d_subdirs)) { @@ -399,28 +403,8 @@ static struct vfsmount *autofs4_d_automount(struct path *path) return ERR_PTR(status); } } -done: - if (!(ino->flags & AUTOFS_INF_EXPIRING)) { - /* - * Any needed mounting has been completed and the path - * updated so clear DCACHE_NEED_AUTOMOUNT so we don't - * call ->d_automount() on rootless multi-mounts since - * it can lead to an incorrect ELOOP error return. 
- * - * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and - * symlinks as in all other cases the dentry will be covered by - * an actual mount so ->d_automount() won't be called during - * the follow. - */ - spin_lock(&dentry->d_lock); - if ((!d_mountpoint(dentry) && - !list_empty(&dentry->d_subdirs)) || - (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))) - __managed_dentry_clear_automount(dentry); - spin_unlock(&dentry->d_lock); - } spin_unlock(&sbi->fs_lock); - +done: /* Mount succeeded, check if we ended up with a new dentry */ dentry = autofs4_mountpoint_changed(path); if (!dentry) @@ -432,6 +416,8 @@ done: int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + int status; DPRINTK("dentry=%p %.*s", dentry, dentry->d_name.len, dentry->d_name.name); @@ -456,7 +442,32 @@ int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) * This dentry may be under construction so wait on mount * completion. */ - return autofs4_mount_wait(dentry); + status = autofs4_mount_wait(dentry); + if (status) + return status; + + spin_lock(&sbi->fs_lock); + /* + * If the dentry has been selected for expire while we slept + * on the lock then it might go away. We'll deal with that in + * ->d_automount() and wait on a new mount if the expire + * succeeds or return here if it doesn't (since there's no + * mount to follow with a rootless multi-mount). + */ + if (!(ino->flags & AUTOFS_INF_EXPIRING)) { + /* + * Any needed mounting has been completed and the path + * updated so check if this is a rootless multi-mount so + * we can avoid needless calls ->d_automount() and avoid + * an incorrect ELOOP error return. + */ + if ((!d_mountpoint(dentry) && !simple_empty(dentry)) || + (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))) + status = -EISDIR; + } + spin_unlock(&sbi->fs_lock); + + return status; } /* Lookups in the root directory */ -- cgit v1.2.1 From 0259cb02c4004d3088b0999799f8f5c8801f6b97 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Fri, 14 Dec 2012 10:23:29 +0800 Subject: autofs4 - use simple_empty() for empty directory check For direct (and offset) mounts, if an automounted mount is manually umounted the trigger mount dentry can appear non-empty causing it to not trigger mounts. This can also happen if there is a file handle leak in a user space automounting application. This happens because, when a ioctl control file handle is opened on the mount, a cursor dentry is created which causes list_empty() to see the dentry as non-empty. Since there is a case where listing the directory of these dentrys is needed, the use of dcache_dir_*() functions for .open() and .release() is needed. Consequently simple_empty() must be used instead of list_empty() when checking for an empty directory. Signed-off-by: Ian Kent Signed-off-by: Linus Torvalds --- fs/autofs4/root.c | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 30a6ab66e99a..c93447604da8 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -124,13 +124,10 @@ static int autofs4_dir_open(struct inode *inode, struct file *file) * it. 
*/ spin_lock(&sbi->lookup_lock); - spin_lock(&dentry->d_lock); - if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { - spin_unlock(&dentry->d_lock); + if (!d_mountpoint(dentry) && simple_empty(dentry)) { spin_unlock(&sbi->lookup_lock); return -ENOENT; } - spin_unlock(&dentry->d_lock); spin_unlock(&sbi->lookup_lock); out: @@ -386,12 +383,8 @@ static struct vfsmount *autofs4_d_automount(struct path *path) goto done; } } else { - spin_lock(&dentry->d_lock); - if (!list_empty(&dentry->d_subdirs)) { - spin_unlock(&dentry->d_lock); + if (!simple_empty(dentry)) goto done; - } - spin_unlock(&dentry->d_lock); } ino->flags |= AUTOFS_INF_PENDING; spin_unlock(&sbi->fs_lock); @@ -610,9 +603,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) spin_lock(&sbi->lookup_lock); __autofs4_add_expiring(dentry); - spin_lock(&dentry->d_lock); - __d_drop(dentry); - spin_unlock(&dentry->d_lock); + d_drop(dentry); spin_unlock(&sbi->lookup_lock); return 0; @@ -683,15 +674,12 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) return -EACCES; spin_lock(&sbi->lookup_lock); - spin_lock(&dentry->d_lock); - if (!list_empty(&dentry->d_subdirs)) { - spin_unlock(&dentry->d_lock); + if (!simple_empty(dentry)) { spin_unlock(&sbi->lookup_lock); return -ENOTEMPTY; } __autofs4_add_expiring(dentry); - __d_drop(dentry); - spin_unlock(&dentry->d_lock); + d_drop(dentry); spin_unlock(&sbi->lookup_lock); if (sbi->version < 5) -- cgit v1.2.1 From 861d66601acda6d7a2038fb3c95f68009128003a Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Fri, 30 Nov 2012 16:03:31 +0200 Subject: exofs: don't leak io_state and pages on read error The same bug Idan fixed for write_exec was also in read_exec. Fix the io_state leak and pages state on read error. Also, while at it: the if (!pcol->read_4_write) in the error path is redundant because all goto err; sites come after the if (pcol->read_4_write) bail-out. Signed-off-by: Boaz Harrosh --- fs/exofs/inode.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 1634b946565f..d1f80abd8828 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -361,12 +361,12 @@ static int read_exec(struct page_collect *pcol) return 0; err: - if (!pcol->read_4_write) - _unlock_pcol_pages(pcol, ret, READ); - - pcol_free(pcol); - + if (!pcol_copy) /* Failed before ownership transfer */ + pcol_copy = pcol; + _unlock_pcol_pages(pcol_copy, ret, READ); + pcol_free(pcol_copy); kfree(pcol_copy); + return ret; } -- cgit v1.2.1 From eed9935745cc44071043ec8c4cde64c820b5c601 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 14 Dec 2012 14:36:36 -0500 Subject: NFS: Ensure that we always drop inodes that have been marked as stale There is no need to cache stale inodes.
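For context, a rough sketch of the VFS contract this relies on (simplified from the iput_final() logic of this era, not verbatim kernel code): when the last reference to an inode is put, the VFS asks the superblock's drop_inode hook whether to evict the inode immediately or keep it cached, so returning nonzero for stale inodes guarantees they are dropped.

static void iput_final_sketch(struct inode *inode)
{
        const struct super_operations *op = inode->i_sb->s_op;
        int drop;

        if (op->drop_inode)
                drop = op->drop_inode(inode);     /* fs override, as below */
        else
                drop = generic_drop_inode(inode); /* !i_nlink || unhashed */

        if (drop) {
                /* evict the inode now */
        } else {
                /* keep it cached for later reuse */
        }
}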
Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 6 ++++++ fs/nfs/internal.h | 1 + fs/nfs/nfs4super.c | 1 + fs/nfs/super.c | 1 + 4 files changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 117183b1ee09..2faae14d89f4 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -107,6 +107,12 @@ u64 nfs_compat_user_ino64(u64 fileid) return ino; } +int nfs_drop_inode(struct inode *inode) +{ + return NFS_STALE(inode) || generic_drop_inode(inode); +} +EXPORT_SYMBOL_GPL(nfs_drop_inode); + void nfs_clear_inode(struct inode *inode) { /* diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 89c1ee4a432c..f0e6c7df1a07 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -296,6 +296,7 @@ extern struct workqueue_struct *nfsiod_workqueue; extern struct inode *nfs_alloc_inode(struct super_block *sb); extern void nfs_destroy_inode(struct inode *); extern int nfs_write_inode(struct inode *, struct writeback_control *); +extern int nfs_drop_inode(struct inode *); extern void nfs_clear_inode(struct inode *); extern void nfs_evict_inode(struct inode *); void nfs_zap_acl_cache(struct inode *inode); diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index bd61221ad2c5..84d2e9e2f313 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -51,6 +51,7 @@ static const struct super_operations nfs4_sops = { .alloc_inode = nfs_alloc_inode, .destroy_inode = nfs_destroy_inode, .write_inode = nfs4_write_inode, + .drop_inode = nfs_drop_inode, .put_super = nfs_put_super, .statfs = nfs_statfs, .evict_inode = nfs4_evict_inode, diff --git a/fs/nfs/super.c b/fs/nfs/super.c index e12cea4b36a5..aa5315bb3666 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -308,6 +308,7 @@ const struct super_operations nfs_sops = { .alloc_inode = nfs_alloc_inode, .destroy_inode = nfs_destroy_inode, .write_inode = nfs_write_inode, + .drop_inode = nfs_drop_inode, .put_super = nfs_put_super, .statfs = nfs_statfs, .evict_inode = nfs_evict_inode, -- cgit v1.2.1 From 1f018458b30b0d5c535c94e577aa0acbb92e1395 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 14 Dec 2012 16:38:46 -0500 Subject: NFS: Fix calls to drop_nlink() It is almost always wrong for NFS to call drop_nlink() after removing a file. What we really want is to mark the inode's attributes for revalidation, and we want to ensure that the VFS drops it if we're reasonably sure that this is the final unlink(). Do the former using the usual cache validity flags, and the latter by testing if inode->i_nlink == 1, and clearing it in that case. This also fixes the following warning reported by Neil Brown and Jeff Layton (among others). 
[634155.004438] WARNING: at /home/abuild/rpmbuild/BUILD/kernel-desktop-3.5.0/lin [634155.004442] Hardware name: Latitude E6510 [634155.004577] crc_itu_t crc32c_intel snd_hwdep snd_pcm snd_timer snd soundcor [634155.004609] Pid: 13402, comm: bash Tainted: G W 3.5.0-36-desktop # [634155.004611] Call Trace: [634155.004630] [] dump_trace+0xaa/0x2b0 [634155.004641] [] dump_stack+0x69/0x6f [634155.004653] [] warn_slowpath_common+0x7b/0xc0 [634155.004662] [] drop_nlink+0x34/0x40 [634155.004687] [] nfs_dentry_iput+0x33/0x70 [nfs] [634155.004714] [] dput+0x12e/0x230 [634155.004726] [] __fput+0x170/0x230 [634155.004735] [] filp_close+0x5f/0x90 [634155.004743] [] sys_close+0x97/0x100 [634155.004754] [] system_call_fastpath+0x16/0x1b [634155.004767] [<00007f2a73a0d110>] 0x7f2a73a0d10f Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org [3.3+] --- fs/nfs/dir.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index ce8cb926526b..a46a74654488 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1155,11 +1155,14 @@ static int nfs_dentry_delete(const struct dentry *dentry) } +/* Ensure that we revalidate inode->i_nlink */ static void nfs_drop_nlink(struct inode *inode) { spin_lock(&inode->i_lock); - if (inode->i_nlink > 0) - drop_nlink(inode); + /* drop the inode if we're reasonably sure this is the last link */ + if (inode->i_nlink == 1) + clear_nlink(inode); + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR; spin_unlock(&inode->i_lock); } @@ -1174,8 +1177,8 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode) NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { - drop_nlink(inode); nfs_complete_unlink(dentry, inode); + nfs_drop_nlink(inode); } iput(inode); } @@ -1646,10 +1649,8 @@ static int nfs_safe_remove(struct dentry *dentry) if (inode != NULL) { NFS_PROTO(inode)->return_delegation(inode); error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); - /* The VFS may want to delete this inode */ if (error == 0) nfs_drop_nlink(inode); - nfs_mark_for_revalidate(inode); } else error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); if (error == -ENOENT) -- cgit v1.2.1 From 65a0c14954493802de01968a73b849f9fc4b4d1a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 14 Dec 2012 17:51:40 -0500 Subject: NFS: nfs_lookup_revalidate should not trust an inode with i_nlink == 0 If the inode has no links, then we should force a new lookup. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index a46a74654488..d8e58ed3d45c 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -978,10 +978,11 @@ static int nfs_is_exclusive_create(struct inode *dir, unsigned int flags) * particular file and the "nocto" mount flag is not set. * */ -static inline +static int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags) { struct nfs_server *server = NFS_SERVER(inode); + int ret; if (IS_AUTOMOUNT(inode)) return 0; @@ -992,9 +993,13 @@ int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags) if ((flags & LOOKUP_OPEN) && !(server->flags & NFS_MOUNT_NOCTO) && (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) goto out_force; - return 0; +out: + return (inode->i_nlink == 0) ? 
-ENOENT : 0; out_force: - return __nfs_revalidate_inode(server, inode); + ret = __nfs_revalidate_inode(server, inode); + if (ret != 0) + return ret; + goto out; } /* -- cgit v1.2.1 From 5e4a08476b50fa39210fca82e03325cc46b9c235 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 14 Dec 2012 07:55:36 -0800 Subject: userns: Require CAP_SYS_ADMIN for most uses of setns. Andy Lutomirski found a nasty little bug in the permissions of setns. With unprivileged user namespaces it became possible to create new namespaces without privilege. However, the setns calls were relaxed to require only CAP_SYS_ADMIN in the user namespace of the target namespace, which made the following nasty sequence possible. pid = clone(CLONE_NEWUSER | CLONE_NEWNS); if (pid == 0) { /* child */ system("mount --bind /home/me/passwd /etc/passwd"); } else if (pid != 0) { /* parent */ char path[PATH_MAX]; snprintf(path, sizeof(path), "/proc/%u/ns/mnt", pid); fd = open(path, O_RDONLY); setns(fd, 0); system("su -"); } Prevent this possibility by requiring CAP_SYS_ADMIN in the current user namespace when joining all but the user namespace. Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index c1bbe86f4920..398a50ff2438 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2781,7 +2781,8 @@ static int mntns_install(struct nsproxy *nsproxy, void *ns) struct path root; if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || - !nsown_capable(CAP_SYS_CHROOT)) + !nsown_capable(CAP_SYS_CHROOT) || + !nsown_capable(CAP_SYS_ADMIN)) return -EPERM; if (fs->users != 1) -- cgit v1.2.1 From e8794440849d1d15fa11251ef1622e6160614874 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 15 Dec 2012 13:56:18 -0500 Subject: NFSv4.1: Try to deal with NFS4ERR_SEQ_MISORDERED. If the server returns NFS4ERR_SEQ_MISORDERED, it could be a sign that the slot was retired at some point. Retry the attempt after reinitialising the slot sequence number to 1. Also add a handler for NFS4ERR_SEQ_FALSE_RETRY. Just bump the slot sequence number and retry... Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b0963aeceeda..9003b8f6b77f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -467,11 +467,19 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res * * The slot id we used was probably retired. Try again * using a different slot id. */ - if (rpc_restart_call_prepare(task)) { - task->tk_status = 0; - ret = 0; - } - break; + goto retry_nowait; + case -NFS4ERR_SEQ_MISORDERED: + /* + * Could this slot have been previously retired? + * If so, then the server may be expecting seq_nr = 1! + */ + if (slot->seq_nr == 1) + break; + slot->seq_nr = 1; + goto retry_nowait; + case -NFS4ERR_SEQ_FALSE_RETRY: + ++slot->seq_nr; + goto retry_nowait; default: /* Just update the slot sequence no.
*/ ++slot->seq_nr; @@ -481,6 +489,12 @@ out: dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); nfs41_sequence_free_slot(res); return ret; +retry_nowait: + if (rpc_restart_call_prepare(task)) { + task->tk_status = 0; + ret = 0; + } + goto out; out_retry: if (!rpc_restart_call(task)) goto out; -- cgit v1.2.1 From 8e63b6a8adabb0551124c3b78f7f5f36912c3728 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 15 Dec 2012 15:21:52 -0500 Subject: NFSv4.1: Move the RPC timestamp out of the slot. Shave a few bytes off the slot table size by moving the RPC timestamp into the sequence results. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 14 +++++++------- fs/nfs/nfs4session.c | 3 +-- fs/nfs/nfs4session.h | 1 - 3 files changed, 8 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 9003b8f6b77f..afb428e63b52 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -419,7 +419,6 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res * { struct nfs4_session *session; struct nfs4_slot *slot; - unsigned long timestamp; struct nfs_client *clp; int ret = 1; @@ -444,9 +443,8 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res * case 0: /* Update the slot's sequence and clientid lease timer */ ++slot->seq_nr; - timestamp = slot->renewal_time; clp = session->clp; - do_renew_lease(clp, timestamp); + do_renew_lease(clp, res->sr_timestamp); /* Check sequence flags */ if (res->sr_status_flags != 0) nfs4_schedule_lease_recovery(clp); @@ -473,10 +471,11 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res * * Could this slot have been previously retired? * If so, then the server may be expecting seq_nr = 1! 
*/ - if (slot->seq_nr == 1) - break; - slot->seq_nr = 1; - goto retry_nowait; + if (slot->seq_nr != 1) { + slot->seq_nr = 1; + goto retry_nowait; + } + break; case -NFS4ERR_SEQ_FALSE_RETRY: ++slot->seq_nr; goto retry_nowait; @@ -567,6 +566,7 @@ int nfs41_setup_sequence(struct nfs4_session *session, slot->slot_nr, slot->seq_nr); res->sr_slot = slot; + res->sr_timestamp = jiffies; res->sr_status_flags = 0; /* * sr_status is only set in decode_sequence, and so will remain diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c index 1e6c87c443a7..0e1cc1f4e51a 100644 --- a/fs/nfs/nfs4session.c +++ b/fs/nfs/nfs4session.c @@ -143,7 +143,6 @@ struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl) if (slotid > tbl->highest_used_slotid || tbl->highest_used_slotid == NFS4_NO_SLOT) tbl->highest_used_slotid = slotid; - ret->renewal_time = jiffies; ret->generation = tbl->generation; out: @@ -228,9 +227,9 @@ static bool nfs41_assign_slot(struct rpc_task *task, void *pslot) if (nfs4_session_draining(tbl->session) && !args->sa_privileged) return false; - slot->renewal_time = jiffies; slot->generation = tbl->generation; args->sa_slot = slot; + res->sr_timestamp = jiffies; res->sr_slot = slot; res->sr_status_flags = 0; res->sr_status = 1; diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index 04f834cab16c..d17b08091d4b 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -19,7 +19,6 @@ struct nfs4_slot { struct nfs4_slot_table *table; struct nfs4_slot *next; unsigned long generation; - unsigned long renewal_time; u32 slot_nr; u32 seq_nr; }; -- cgit v1.2.1 From ac20d163fccf9fa6acec8b68f127003635e13b72 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 15 Dec 2012 15:36:07 -0500 Subject: NFSv4.1: Deal effectively with interrupted RPC calls. If an RPC call is interrupted, assume that the server hasn't processed the RPC call so that the next time we use the slot, we know that if we get a NFS4ERR_SEQ_MISORDERED or NFS4ERR_SEQ_FALSE_RETRY, we just have to bump the sequence number. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 32 +++++++++++++++++++++++--------- fs/nfs/nfs4session.c | 1 + fs/nfs/nfs4session.h | 1 + 3 files changed, 25 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index afb428e63b52..493f0f41c554 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -420,17 +420,9 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res * struct nfs4_session *session; struct nfs4_slot *slot; struct nfs_client *clp; + bool interrupted = false; int ret = 1; - /* - * sr_status remains 1 if an RPC level error occurred. The server - * may or may not have processed the sequence operation.. - * Proceed as if the server received and processed the sequence - * operation. - */ - if (res->sr_status == 1) - res->sr_status = NFS_OK; - /* don't increment the sequence number if the task wasn't sent */ if (!RPC_WAS_SENT(task)) goto out; @@ -438,6 +430,11 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res * slot = res->sr_slot; session = slot->table->session; + if (slot->interrupted) { + slot->interrupted = 0; + interrupted = true; + } + /* Check the SEQUENCE operation status */ switch (res->sr_status) { case 0: @@ -450,6 +447,15 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res * nfs4_schedule_lease_recovery(clp); nfs41_update_target_slotid(slot->table, slot, res); break; + case 1: + /* + * sr_status remains 1 if an RPC level error occurred. 
+ * The server may or may not have processed the sequence + * operation.. + * Mark the slot as having hosted an interrupted RPC call. + */ + slot->interrupted = 1; + goto out; case -NFS4ERR_DELAY: /* The server detected a resend of the RPC call and * returned NFS4ERR_DELAY as per Section 2.10.6.2 @@ -467,6 +473,14 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res * */ goto retry_nowait; case -NFS4ERR_SEQ_MISORDERED: + /* + * Was the last operation on this sequence interrupted? + * If so, retry after bumping the sequence number. + */ + if (interrupted) { + ++slot->seq_nr; + goto retry_nowait; + } /* * Could this slot have been previously retired? * If so, then the server may be expecting seq_nr = 1! diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c index 0e1cc1f4e51a..ebda5f4a031b 100644 --- a/fs/nfs/nfs4session.c +++ b/fs/nfs/nfs4session.c @@ -172,6 +172,7 @@ static void nfs4_reset_slot_table(struct nfs4_slot_table *tbl, p = &tbl->slots; while (*p) { (*p)->seq_nr = ivalue; + (*p)->interrupted = 0; p = &(*p)->next; } tbl->highest_used_slotid = NFS4_NO_SLOT; diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index d17b08091d4b..6f3cb39386d4 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -21,6 +21,7 @@ struct nfs4_slot { unsigned long generation; u32 slot_nr; u32 seq_nr; + unsigned int interrupted : 1; }; /* Sessions */ -- cgit v1.2.1 From ada8e20d044c0fa5610e504ce6fb4578ebd3edd9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 15 Dec 2012 17:12:14 -0500 Subject: NFS: Don't use SetPageError in the NFS writeback code The writeback code is already capable of passing errors back to user space by means of the open_context->error. In the case of ENOSPC, Neil Brown is reporting seeing 2 errors being returned. Neil writes: "e.g. if /mnt2/ if an nfs mounted filesystem that has no space then strace dd if=/dev/zero conv=fsync >> /mnt2/afile count=1 reported Input/output error and the relevant parts of the strace output are: write(1, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512 fsync(1) = -1 EIO (Input/output error) close(1) = -1 ENOSPC (No space left on device)" Neil then shows that the duplication of error messages appears to be due to the use of the PageError() mechanism, which causes filemap_fdatawait_range to return the extra EIO. The regression was introduced by commit 7b281ee026552f10862b617a2a51acf49c829554 (NFS: fsync() must exit with an error if page writeback failed). Fix this by removing the call to SetPageError(), and just relying on open_context->error reporting the ENOSPC back to fsync(). Reported-by: Neil Brown Tested-by: Neil Brown Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org [3.6+] --- fs/nfs/write.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f608ca606b2b..5209916e1222 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -202,7 +202,6 @@ out: /* A writeback failed: mark the page as bad, and invalidate the page cache */ static void nfs_set_pageerror(struct page *page) { - SetPageError(page); nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page)); } -- cgit v1.2.1 From 3f6bcfbd4149875662773eb40a62294cddf215d4 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 6 Nov 2012 15:08:53 +0100 Subject: Btrfs: add support for device replace ioctls This is the commit that allows to start the device replace procedure. 
An ioctl() interface is added that supports starting and canceling the device replace procedure, and to retrieve the status and progress. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ioctl.h | 7 ++++--- 2 files changed, 52 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e54b5e50c927..9a71fec86152 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -55,6 +55,7 @@ #include "backref.h" #include "rcu-string.h" #include "send.h" +#include "dev-replace.h" /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) @@ -3171,6 +3172,51 @@ static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root, return ret; } +static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_dev_replace_args *p; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + p = memdup_user(arg, sizeof(*p)); + if (IS_ERR(p)) + return PTR_ERR(p); + + switch (p->cmd) { + case BTRFS_IOCTL_DEV_REPLACE_CMD_START: + if (atomic_xchg( + &root->fs_info->mutually_exclusive_operation_running, + 1)) { + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + ret = -EINPROGRESS; + } else { + ret = btrfs_dev_replace_start(root, p); + atomic_set( + &root->fs_info->mutually_exclusive_operation_running, + 0); + } + break; + case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS: + btrfs_dev_replace_status(root->fs_info, p); + ret = 0; + break; + case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL: + ret = btrfs_dev_replace_cancel(root->fs_info, p); + break; + default: + ret = -EINVAL; + break; + } + + if (copy_to_user(arg, p, sizeof(*p))) + ret = -EFAULT; + + kfree(p); + return ret; +} + static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) { int ret = 0; @@ -3826,6 +3872,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_qgroup_create(root, argp); case BTRFS_IOC_QGROUP_LIMIT: return btrfs_ioctl_qgroup_limit(root, argp); + case BTRFS_IOC_DEV_REPLACE: + return btrfs_ioctl_dev_replace(root, argp); } return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 62006ba02719..dabca9cc8c2e 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -30,6 +30,8 @@ struct btrfs_ioctl_vol_args { char name[BTRFS_PATH_NAME_MAX + 1]; }; +#define BTRFS_DEVICE_PATH_NAME_MAX 1024 + #define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) #define BTRFS_SUBVOL_RDONLY (1ULL << 1) #define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2) @@ -127,10 +129,10 @@ struct btrfs_ioctl_scrub_args { #define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 struct btrfs_ioctl_dev_replace_start_params { __u64 srcdevid; /* in, if 0, use srcdev_name instead */ - __u8 srcdev_name[BTRFS_PATH_NAME_MAX + 1]; /* in */ - __u8 tgtdev_name[BTRFS_PATH_NAME_MAX + 1]; /* in */ __u64 cont_reading_from_srcdev_mode; /* in, see #define * above */ + __u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */ + __u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */ }; #define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED 0 @@ -165,7 +167,6 @@ struct btrfs_ioctl_dev_replace_args { __u64 spare[64]; }; -#define BTRFS_DEVICE_PATH_NAME_MAX 1024 struct btrfs_ioctl_dev_info_args { __u64 devid; /* in/out */ __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */ -- cgit v1.2.1 From 071401258a580dec2a3e0c2700b7e76f3ed43320 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Fri, 23 Nov 
2012 03:03:14 +0000 Subject: Btrfs: do not warn_on io_ctl->cur in io_ctl_map_page io_ctl_map_page is called by many functions in free-space-cache. In most scenarios, the ->cur is not null, e.g. io_ctl_add_entry. I think we'd better remove the warn_on here. Signed-off-by: Wang Sheng-Hui Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 557502ca1a2a..efdd1d3f441c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -307,7 +307,6 @@ static void io_ctl_unmap_page(struct io_ctl *io_ctl) static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) { - WARN_ON(io_ctl->cur); BUG_ON(io_ctl->index >= io_ctl->num_pages); io_ctl->page = io_ctl->pages[io_ctl->index++]; io_ctl->cur = kmap(io_ctl->page); -- cgit v1.2.1 From db2254bce4f19f458aaa05f9d00b39f413f7488c Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 26 Nov 2012 02:58:36 +0000 Subject: Btrfs: fix an while-loop of listxattr If we found an invalid xattr dir item, we'd better try the next one instead. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 3f4e2d69e83a..e9d384055494 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -265,7 +265,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); if (verify_dir_item(root, leaf, di)) - continue; + goto next; name_len = btrfs_dir_name_len(leaf, di); total_size += name_len + 1; -- cgit v1.2.1 From 9a8c28bec1b40e934ed28149b7eaa7d2fafed92d Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 08:40:43 +0000 Subject: Btrfs: pass root object into btrfs_ioctl_{start, wait}_sync() Since we have gotten the root in the caller, just pass it into btrfs_ioctl_{start, wait}_sync() directly. 
Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9a71fec86152..5022e62e63a8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3058,9 +3058,9 @@ long btrfs_ioctl_trans_end(struct file *file) return 0; } -static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp) +static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, + void __user *argp) { - struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root; struct btrfs_trans_handle *trans; u64 transid; int ret; @@ -3081,9 +3081,9 @@ static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp return 0; } -static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp) +static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root, + void __user *argp) { - struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root; u64 transid; if (argp) { @@ -3843,9 +3843,9 @@ long btrfs_ioctl(struct file *file, unsigned int btrfs_sync_fs(file->f_dentry->d_sb, 1); return 0; case BTRFS_IOC_START_SYNC: - return btrfs_ioctl_start_sync(file, argp); + return btrfs_ioctl_start_sync(root, argp); case BTRFS_IOC_WAIT_SYNC: - return btrfs_ioctl_wait_sync(file, argp); + return btrfs_ioctl_wait_sync(root, argp); case BTRFS_IOC_SCRUB: return btrfs_ioctl_scrub(root, argp); case BTRFS_IOC_SCRUB_CANCEL: -- cgit v1.2.1 From ff7c1d33551862c86f7737fe88edc3e499d291e6 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 08:41:29 +0000 Subject: Btrfs: don't start a new transaction when starting sync If there is no running transaction in the fs, we needn't start a new one when we want to start sync. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 14 ++++++++++---- fs/btrfs/transaction.c | 13 ++++++++----- 2 files changed, 18 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5022e62e63a8..7b1f614f51f6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3065,16 +3065,22 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, u64 transid; int ret; - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); + trans = btrfs_attach_transaction(root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT) + return PTR_ERR(trans); + + /* No running transaction, don't bother */ + transid = root->fs_info->last_trans_committed; + goto out; + } transid = trans->transid; ret = btrfs_commit_transaction_async(trans, root, 0); if (ret) { btrfs_end_transaction(trans, root); return ret; } - +out: if (argp) if (copy_to_user(argp, &transid, sizeof(transid))) return -EFAULT; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bcc6b65be3b0..8db401fa2f8f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1307,9 +1307,10 @@ static void do_async_commit(struct work_struct *work) * We've got freeze protection passed with the transaction. * Tell lockdep about it. 
*/ - rwsem_acquire_read( - &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], - 0, 1, _THIS_IP_); + if (ac->newtrans->type < TRANS_JOIN_NOLOCK) + rwsem_acquire_read( + &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], + 0, 1, _THIS_IP_); current->journal_info = ac->newtrans; @@ -1347,8 +1348,10 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, * Tell lockdep we've released the freeze rwsem, since the * async commit thread will be the one to unlock it. */ - rwsem_release(&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], - 1, _THIS_IP_); + if (trans->type < TRANS_JOIN_NOLOCK) + rwsem_release( + &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], + 1, _THIS_IP_); schedule_delayed_work(&ac->work, 0); -- cgit v1.2.1 From 8cd2807f79b73ef2d8c1cb6b3732dc5758ac7212 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 08:42:07 +0000 Subject: Btrfs: fix wrong return value of btrfs_wait_for_commit() If the id of an existing transaction is greater than the one we specified, it means the specified transaction was committed, so we should return 0, not EINVAL. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 8db401fa2f8f..e6509b92433b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -456,28 +456,31 @@ static noinline void wait_for_commit(struct btrfs_root *root, int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) { struct btrfs_transaction *cur_trans = NULL, *t; - int ret; + int ret = 0; - ret = 0; if (transid) { if (transid <= root->fs_info->last_trans_committed) goto out; + ret = -EINVAL; /* find specified transaction */ spin_lock(&root->fs_info->trans_lock); list_for_each_entry(t, &root->fs_info->trans_list, list) { if (t->transid == transid) { cur_trans = t; atomic_inc(&cur_trans->use_count); + ret = 0; break; } - if (t->transid > transid) + if (t->transid > transid) { + ret = 0; break; + } } spin_unlock(&root->fs_info->trans_lock); - ret = -EINVAL; + /* The specified transaction doesn't exist */ if (!cur_trans) - goto out; /* bad transid */ + goto out; } else { /* find newest transaction that is committing | committed */ spin_lock(&root->fs_info->trans_lock); @@ -497,9 +500,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) } wait_for_commit(root, cur_trans); - put_transaction(cur_trans); - ret = 0; out: return ret; } -- cgit v1.2.1 From 3c04ce01053413007b9df88313b8b8e17272b57b Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 08:43:07 +0000 Subject: Btrfs: get write access when setting the default subvolume When we want to set the default subvolume, we must get write access, or we will change the R/O file system.
Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7b1f614f51f6..10bc65ed736c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2843,12 +2843,19 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) struct btrfs_disk_key disk_key; u64 objectid = 0; u64 dir_id; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (copy_from_user(&objectid, argp, sizeof(objectid))) - return -EFAULT; + ret = mnt_want_write_file(file); + if (ret) + return ret; + + if (copy_from_user(&objectid, argp, sizeof(objectid))) { + ret = -EFAULT; + goto out; + } if (!objectid) objectid = root->root_key.objectid; @@ -2858,21 +2865,28 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) location.offset = (u64)-1; new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); - if (IS_ERR(new_root)) - return PTR_ERR(new_root); + if (IS_ERR(new_root)) { + ret = PTR_ERR(new_root); + goto out; + } - if (btrfs_root_refs(&new_root->root_item) == 0) - return -ENOENT; + if (btrfs_root_refs(&new_root->root_item) == 0) { + ret = -ENOENT; + goto out; + } path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (!path) { + ret = -ENOMEM; + goto out; + } path->leave_spinning = 1; trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { btrfs_free_path(path); - return PTR_ERR(trans); + ret = PTR_ERR(trans); + goto out; } dir_id = btrfs_super_root_dir(root->fs_info->super_copy); @@ -2883,7 +2897,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) btrfs_end_transaction(trans, root); printk(KERN_ERR "Umm, you don't have the default dir item, " "this isn't going to work\n"); - return -ENOENT; + ret = -ENOENT; + goto out; } btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); @@ -2893,8 +2908,9 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL); btrfs_end_transaction(trans, root); - - return 0; +out: + mnt_drop_write_file(file); + return ret; } void btrfs_get_block_group_info(struct list_head *groups_list, -- cgit v1.2.1 From 198605a8e2077f174c9834c97b836f535e4e56dd Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 08:43:45 +0000 Subject: Btrfs: get write access when doing resize fs Steps to reproduce: # mkfs.btrfs # mount -o ro # mount -o ro # mount -o remount,rw # umount # btrfs fi resize 10g We re-sized an R/O filesystem. The reason is that we just check the R/O flag of the super block object. It is not enough, because the kernel may set the R/O flag only for the mount point. We need to invoke mnt_want_write_file() to do a full check.
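The reproducer works because the superblock flag and the per-mount flag are independent: MS_RDONLY lives on the superblock, while a read-only mount sets MNT_READONLY on the vfsmount. Roughly, the insufficient and sufficient checks compare as follows (a sketch, not the literal patch):

/* Insufficient: only sees the filesystem-wide read-only flag. */
if (root->fs_info->sb->s_flags & MS_RDONLY)
	return -EROFS;

/*
 * Sufficient: mnt_want_write_file() rejects both a read-only
 * superblock and a read-only mount, and takes freeze protection.
 */
ret = mnt_want_write_file(file);
if (ret)
	return ret;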
Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 10bc65ed736c..2be49b4c82d6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1298,12 +1298,13 @@ out_ra: return ret; } -static noinline int btrfs_ioctl_resize(struct btrfs_root *root, +static noinline int btrfs_ioctl_resize(struct file *file, void __user *arg) { u64 new_size; u64 old_size; u64 devid = 1; + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct btrfs_ioctl_vol_args *vol_args; struct btrfs_trans_handle *trans; struct btrfs_device *device = NULL; @@ -1318,6 +1319,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (!capable(CAP_SYS_ADMIN)) return -EPERM; + ret = mnt_want_write_file(file); + if (ret) + return ret; + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); @@ -1425,6 +1430,7 @@ out_free: kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); + mnt_drop_write_file(file); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); return ret; } @@ -3832,7 +3838,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_DEFRAG_RANGE: return btrfs_ioctl_defrag(file, argp); case BTRFS_IOC_RESIZE: - return btrfs_ioctl_resize(root, argp); + return btrfs_ioctl_resize(file, argp); case BTRFS_IOC_ADD_DEV: return btrfs_ioctl_add_dev(root, argp); case BTRFS_IOC_RM_DEV: -- cgit v1.2.1 From da24927b1e1925da5c1885cb483231dabe027e15 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 08:44:50 +0000 Subject: Btrfs: get write access when removing a device Steps to reproduce: # mkfs.btrfs -d single -m single # mount -o ro # mount -o ro # mount -o remount,rw # umount # btrfs device delete We can remove a device from an R/O filesystem. The reason is that we just check the R/O flag of the super block object. It is not enough, because the kernel may set the R/O flag only for the mount point. We need to invoke mnt_want_write_file() to do a full check.
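Note how the diff below orders the cleanup: the write reference is taken before the mutually_exclusive_operation_running gate, so the gate's failure path must drop it again. The gate itself is a common lock-free pattern; a minimal sketch with illustrative names:

#include <linux/atomic.h>
#include <linux/errno.h>

static atomic_t op_running = ATOMIC_INIT(0);	/* illustrative global */

static int try_start_exclusive_op(void)
{
	/* xchg returns the old value; non-zero means another op won the race. */
	if (atomic_xchg(&op_running, 1))
		return -EINPROGRESS;
	return 0;
}

static void end_exclusive_op(void)
{
	atomic_set(&op_running, 0);
}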
Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2be49b4c82d6..ee36009f8aa1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2270,20 +2270,23 @@ out: return ret; } -static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) { + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct btrfs_ioctl_vol_args *vol_args; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; + ret = mnt_want_write_file(file); + if (ret) + return ret; if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + mnt_drop_write_file(file); return -EINPROGRESS; } @@ -2300,6 +2303,7 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); + mnt_drop_write_file(file); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); return ret; } @@ -3842,7 +3846,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_ADD_DEV: return btrfs_ioctl_add_dev(root, argp); case BTRFS_IOC_RM_DEV: - return btrfs_ioctl_rm_dev(root, argp); + return btrfs_ioctl_rm_dev(file, argp); case BTRFS_IOC_FS_INFO: return btrfs_ioctl_fs_info(root, argp); case BTRFS_IOC_DEV_INFO: -- cgit v1.2.1 From b8e95489bf0ddf767e4bd38f537e0adad16ee830 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 08:48:01 +0000 Subject: Btrfs: get write access for scrub We need to get write access for scrub, or we could modify the R/O fs.
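Because a read-only scrub makes no modifications, the patch below takes write access only when repair is possible. The acquire and release conditions must evaluate identically; caching the predicate makes that explicit (a defensive variant of the same shape, not the literal patch, with do_scrub() as an illustrative stand-in):

static long scrub_ioctl_sketch(struct file *file,
			       struct btrfs_ioctl_scrub_args *sa)
{
	bool readonly = sa->flags & BTRFS_SCRUB_READONLY;
	int ret = 0;

	if (!readonly) {
		ret = mnt_want_write_file(file);
		if (ret)
			return ret;
	}

	ret = do_scrub(sa);		/* illustrative worker */

	/* Same predicate as above, so acquire and release always pair up. */
	if (!readonly)
		mnt_drop_write_file(file);
	return ret;
}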
Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ee36009f8aa1..12b18c01b911 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3127,10 +3127,11 @@ static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root, return btrfs_wait_for_commit(root, transid); } -static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_scrub(struct file *file, void __user *arg) { - int ret; + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct btrfs_ioctl_scrub_args *sa; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -3139,6 +3140,12 @@ static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) if (IS_ERR(sa)) return PTR_ERR(sa); + if (!(sa->flags & BTRFS_SCRUB_READONLY)) { + ret = mnt_want_write_file(file); + if (ret) + goto out; + } + ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end, &sa->progress, sa->flags & BTRFS_SCRUB_READONLY, 0); @@ -3146,6 +3153,9 @@ static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) if (copy_to_user(arg, sa, sizeof(*sa))) ret = -EFAULT; + if (!(sa->flags & BTRFS_SCRUB_READONLY)) + mnt_drop_write_file(file); +out: kfree(sa); return ret; } @@ -3879,7 +3889,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_WAIT_SYNC: return btrfs_ioctl_wait_sync(root, argp); case BTRFS_IOC_SCRUB: - return btrfs_ioctl_scrub(root, argp); + return btrfs_ioctl_scrub(file, argp); case BTRFS_IOC_SCRUB_CANCEL: return btrfs_ioctl_scrub_cancel(root, argp); case BTRFS_IOC_SCRUB_PROGRESS: -- cgit v1.2.1 From 905b0dda06a064db08b8a814e968786ff3c4cc19 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 08:50:11 +0000 Subject: Btrfs: get write access for qgroup operations We need to get write access for qgroup operations, or we could modify the R/O fs.
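All four qgroup handlers below follow the same unwind discipline: resources are released in reverse order of acquisition, with one goto label per acquisition. A compressed sketch of the shape they share (struct args and do_qgroup_op() are illustrative):

static long qgroup_ioctl_sketch(struct file *file, void __user *arg)
{
	struct args *sa;			/* illustrative argument struct */
	int ret;

	ret = mnt_want_write_file(file);	/* acquire #1 */
	if (ret)
		return ret;

	sa = memdup_user(arg, sizeof(*sa));	/* acquire #2 */
	if (IS_ERR(sa)) {
		ret = PTR_ERR(sa);
		goto drop_write;		/* undo #1 only */
	}

	ret = do_qgroup_op(sa);			/* illustrative worker */

	kfree(sa);				/* undo #2 */
drop_write:
	mnt_drop_write_file(file);		/* undo #1 */
	return ret;
}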
Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 73 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 48 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 12b18c01b911..657d83ca9dea 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3558,8 +3558,9 @@ out: return ret; } -static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) { + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct btrfs_ioctl_quota_ctl_args *sa; struct btrfs_trans_handle *trans = NULL; int ret; @@ -3568,12 +3569,15 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; + ret = mnt_want_write_file(file); + if (ret) + return ret; sa = memdup_user(arg, sizeof(*sa)); - if (IS_ERR(sa)) - return PTR_ERR(sa); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + goto drop_write; + } if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) { trans = btrfs_start_transaction(root, 2); @@ -3606,14 +3610,16 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg) if (err && !ret) ret = err; } - out: kfree(sa); +drop_write: + mnt_drop_write_file(file); return ret; } -static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) { + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct btrfs_ioctl_qgroup_assign_args *sa; struct btrfs_trans_handle *trans; int ret; @@ -3622,12 +3628,15 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; + ret = mnt_want_write_file(file); + if (ret) + return ret; sa = memdup_user(arg, sizeof(*sa)); - if (IS_ERR(sa)) - return PTR_ERR(sa); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + goto drop_write; + } trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { @@ -3650,11 +3659,14 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg) out: kfree(sa); +drop_write: + mnt_drop_write_file(file); return ret; } -static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) { + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct btrfs_ioctl_qgroup_create_args *sa; struct btrfs_trans_handle *trans; int ret; @@ -3663,12 +3675,15 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; + ret = mnt_want_write_file(file); + if (ret) + return ret; sa = memdup_user(arg, sizeof(*sa)); - if (IS_ERR(sa)) - return PTR_ERR(sa); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + goto drop_write; + } trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { @@ -3690,11 +3705,14 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg) out: kfree(sa); +drop_write: + mnt_drop_write_file(file); return ret; } -static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) { + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct 
btrfs_ioctl_qgroup_limit_args *sa; struct btrfs_trans_handle *trans; int ret; @@ -3704,12 +3722,15 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; + ret = mnt_want_write_file(file); + if (ret) + return ret; sa = memdup_user(arg, sizeof(*sa)); - if (IS_ERR(sa)) - return PTR_ERR(sa); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + goto drop_write; + } trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { @@ -3732,6 +3753,8 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg) out: kfree(sa); +drop_write: + mnt_drop_write_file(file); return ret; } @@ -3907,13 +3930,13 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_GET_DEV_STATS: return btrfs_ioctl_get_dev_stats(root, argp); case BTRFS_IOC_QUOTA_CTL: - return btrfs_ioctl_quota_ctl(root, argp); + return btrfs_ioctl_quota_ctl(file, argp); case BTRFS_IOC_QGROUP_ASSIGN: - return btrfs_ioctl_qgroup_assign(root, argp); + return btrfs_ioctl_qgroup_assign(file, argp); case BTRFS_IOC_QGROUP_CREATE: - return btrfs_ioctl_qgroup_create(root, argp); + return btrfs_ioctl_qgroup_create(file, argp); case BTRFS_IOC_QGROUP_LIMIT: - return btrfs_ioctl_qgroup_limit(root, argp); + return btrfs_ioctl_qgroup_limit(file, argp); case BTRFS_IOC_DEV_REPLACE: return btrfs_ioctl_dev_replace(root, argp); } -- cgit v1.2.1 From 9247f3170b2c3d648707c93bbebcd763fac17c06 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 09:24:43 +0000 Subject: Btrfs: use slabs for auto defrag allocation The auto defrag allocation is in the fast path of the IO, so use slabs to improve the speed of the allocation. Besides that, it can check for leaked objects when the module is removed.
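The slab conversion below touches call sites that pair up as allocate/free and create/destroy. A minimal sketch of the lifecycle, with an illustrative object type:

#include <linux/slab.h>

struct my_obj { int x; };			/* illustrative */
static struct kmem_cache *my_cachep;

int my_cache_init(void)
{
	/* One cache per object type: fast allocation, leak check on destroy. */
	my_cachep = kmem_cache_create("my_obj", sizeof(struct my_obj),
				      0, SLAB_RECLAIM_ACCOUNT, NULL);
	return my_cachep ? 0 : -ENOMEM;
}

void my_cache_use(void)
{
	struct my_obj *obj = kmem_cache_zalloc(my_cachep, GFP_NOFS);

	if (obj)
		kmem_cache_free(my_cachep, obj);
}

void my_cache_exit(void)
{
	/* Warns about (and thereby catches) objects leaked at module unload. */
	if (my_cachep)
		kmem_cache_destroy(my_cachep);
}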
Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/file.c | 28 ++++++++++++++++++++++++---- fs/btrfs/super.c | 9 ++++++++- 3 files changed, 34 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 91ff078e85df..389c05715eaa 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3505,6 +3505,8 @@ void btrfs_get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space); /* file.c */ +int btrfs_auto_defrag_init(void); +void btrfs_auto_defrag_exit(void); int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index bd7f1b01e051..15117eae85c4 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -41,6 +41,7 @@ #include "compat.h" #include "volumes.h" +static struct kmem_cache *btrfs_inode_defrag_cachep; /* * when auto defrag is enabled we * queue up these defrag structs to remember which @@ -127,7 +128,7 @@ static void __btrfs_add_inode_defrag(struct inode *inode, return; exists: - kfree(defrag); + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); return; } @@ -157,7 +158,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, else transid = BTRFS_I(inode)->root->last_trans; - defrag = kzalloc(sizeof(*defrag), GFP_NOFS); + defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); if (!defrag) return -ENOMEM; @@ -169,7 +170,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) __btrfs_add_inode_defrag(inode, defrag); else - kfree(defrag); + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); spin_unlock(&root->fs_info->defrag_inodes_lock); return 0; } @@ -315,7 +316,8 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) next: spin_lock(&fs_info->defrag_inodes_lock); next_free: - kfree(defrag); + if (defrag) + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); } spin_unlock(&fs_info->defrag_inodes_lock); @@ -2293,3 +2295,21 @@ const struct file_operations btrfs_file_operations = { .compat_ioctl = btrfs_ioctl, #endif }; + +void btrfs_auto_defrag_exit(void) +{ + if (btrfs_inode_defrag_cachep) + kmem_cache_destroy(btrfs_inode_defrag_cachep); +} + +int btrfs_auto_defrag_init(void) +{ + btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", + sizeof(struct inode_defrag), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); + if (!btrfs_inode_defrag_cachep) + return -ENOMEM; + + return 0; +} diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index def4f24b58df..99545df1b86c 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1680,10 +1680,14 @@ static int __init init_btrfs_fs(void) if (err) goto free_ordered_data; - err = btrfs_interface_init(); + err = btrfs_auto_defrag_init(); if (err) goto free_delayed_inode; + err = btrfs_interface_init(); + if (err) + goto free_auto_defrag; + err = register_filesystem(&btrfs_fs_type); if (err) goto unregister_ioctl; @@ -1695,6 +1699,8 @@ static int __init init_btrfs_fs(void) unregister_ioctl: btrfs_interface_exit(); +free_auto_defrag: + btrfs_auto_defrag_exit(); free_delayed_inode: btrfs_delayed_inode_exit(); free_ordered_data: @@ -1714,6 +1720,7 @@ free_compress: static void __exit exit_btrfs_fs(void) { btrfs_destroy_cachep(); + btrfs_auto_defrag_exit(); btrfs_delayed_inode_exit(); ordered_data_exit(); extent_map_exit(); -- cgit v1.2.1 From 8ddc473433b5e8ce8693db9f6e251f5a28267528 Mon Sep 17 
00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 09:25:38 +0000 Subject: Btrfs: fix unprotected defragable inode insertion We forgot to take the defrag lock when re-adding the defragable inode. Fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/file.c | 70 ++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 55 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 15117eae85c4..00918321e390 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -91,7 +91,7 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1, * If an existing record is found the defrag item you * pass in is freed */ -static void __btrfs_add_inode_defrag(struct inode *inode, +static int __btrfs_add_inode_defrag(struct inode *inode, struct inode_defrag *defrag) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -119,18 +119,24 @@ static void __btrfs_add_inode_defrag(struct inode *inode, entry->transid = defrag->transid; if (defrag->last_offset > entry->last_offset) entry->last_offset = defrag->last_offset; - goto exists; + return -EEXIST; } } set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); rb_link_node(&defrag->rb_node, parent, p); rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); - return; + return 0; +} -exists: - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - return; +static inline int __need_auto_defrag(struct btrfs_root *root) +{ + if (!btrfs_test_opt(root, AUTO_DEFRAG)) + return 0; + + if (btrfs_fs_closing(root->fs_info)) + return 0; + return 1; } /* @@ -143,11 +149,9 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, struct btrfs_root *root = BTRFS_I(inode)->root; struct inode_defrag *defrag; u64 transid; + int ret; - if (!btrfs_test_opt(root, AUTO_DEFRAG)) - return 0; - - if (btrfs_fs_closing(root->fs_info)) + if (!__need_auto_defrag(root)) return 0; if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) @@ -167,14 +171,50 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, defrag->root = root->root_key.objectid; spin_lock(&root->fs_info->defrag_inodes_lock); - if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) - __btrfs_add_inode_defrag(inode, defrag); - else + if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) { + /* + * If we set IN_DEFRAG flag and evict the inode from memory, + * and then re-read this inode, this new inode doesn't have + * IN_DEFRAG flag. At the case, we may find the existed defrag. + */ + ret = __btrfs_add_inode_defrag(inode, defrag); + if (ret) + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + } else { + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + } spin_unlock(&root->fs_info->defrag_inodes_lock); return 0; } +/* + * Requeue the defrag object. If there is a defrag object that points to + * the same inode in the tree, we will merge them together (by + * __btrfs_add_inode_defrag()) and free the one that we want to requeue. + */ +void btrfs_requeue_inode_defrag(struct inode *inode, + struct inode_defrag *defrag) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + if (!__need_auto_defrag(root)) + goto out; + + /* + * Here we don't check the IN_DEFRAG flag, because we need merge + * them together. 
+ */ + spin_lock(&root->fs_info->defrag_inodes_lock); + ret = __btrfs_add_inode_defrag(inode, defrag); + spin_unlock(&root->fs_info->defrag_inodes_lock); + if (ret) + goto out; + return; +out: + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); +} + /* * must be called with the defrag_inodes lock held */ @@ -294,7 +334,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) */ if (num_defrag == defrag_batch) { defrag->last_offset = range.start; - __btrfs_add_inode_defrag(inode, defrag); + btrfs_requeue_inode_defrag(inode, defrag); /* * we don't want to kfree defrag, we added it back to * the rbtree @@ -308,7 +348,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) */ defrag->last_offset = 0; defrag->cycled = 1; - __btrfs_add_inode_defrag(inode, defrag); + btrfs_requeue_inode_defrag(inode, defrag); defrag = NULL; } -- cgit v1.2.1 From 26176e7c2aa923327becdc25b5aca2cb907ac932 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 09:26:20 +0000 Subject: Btrfs: restructure btrfs_run_defrag_inodes() This patch restructures btrfs_run_defrag_inodes() and makes the auto defragment code more readable. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 2 +- fs/btrfs/file.c | 197 +++++++++++++++++++++++++++++------------------ 3 files changed, 109 insertions(+), 91 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 389c05715eaa..6ba56aea5b62 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3510,6 +3510,7 @@ void btrfs_auto_defrag_exit(void); int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); +void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, int skip_pinned); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 76b82506bf92..3229531af8c3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3329,7 +3329,7 @@ int close_ctree(struct btrfs_root *root) (atomic_read(&fs_info->defrag_running) == 0)); /* clear out the rbtree of defraggable inodes */ - btrfs_run_defrag_inodes(fs_info); + btrfs_cleanup_defrag_inodes(fs_info); if (!(fs_info->sb->s_flags & MS_RDONLY)) { ret = btrfs_commit_super(root); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 00918321e390..3c6f7479cd5b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -216,11 +216,11 @@ out: } /* - * must be called with the defrag_inodes lock held + * pick the defragable inode that we want, if it doesn't exist, we will get + * the next one. 
*/ -struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, - u64 root, u64 ino, - struct rb_node **next) +static struct inode_defrag * +btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino) { struct inode_defrag *entry = NULL; struct inode_defrag tmp; @@ -231,7 +231,8 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, tmp.ino = ino; tmp.root = root; - p = info->defrag_inodes.rb_node; + spin_lock(&fs_info->defrag_inodes_lock); + p = fs_info->defrag_inodes.rb_node; while (p) { parent = p; entry = rb_entry(parent, struct inode_defrag, rb_node); @@ -242,52 +243,128 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, else if (ret > 0) p = parent->rb_right; else - return entry; + goto out; } - if (next) { - while (parent && __compare_inode_defrag(&tmp, entry) > 0) { - parent = rb_next(parent); + if (parent && __compare_inode_defrag(&tmp, entry) > 0) { + parent = rb_next(parent); + if (parent) entry = rb_entry(parent, struct inode_defrag, rb_node); - } - *next = parent; + else + entry = NULL; } - return NULL; +out: + if (entry) + rb_erase(parent, &fs_info->defrag_inodes); + spin_unlock(&fs_info->defrag_inodes_lock); + return entry; } -/* - * run through the list of inodes in the FS that need - * defragging - */ -int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) +void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) { struct inode_defrag *defrag; + struct rb_node *node; + + spin_lock(&fs_info->defrag_inodes_lock); + node = rb_first(&fs_info->defrag_inodes); + while (node) { + rb_erase(node, &fs_info->defrag_inodes); + defrag = rb_entry(node, struct inode_defrag, rb_node); + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + + if (need_resched()) { + spin_unlock(&fs_info->defrag_inodes_lock); + cond_resched(); + spin_lock(&fs_info->defrag_inodes_lock); + } + + node = rb_first(&fs_info->defrag_inodes); + } + spin_unlock(&fs_info->defrag_inodes_lock); +} + +#define BTRFS_DEFRAG_BATCH 1024 + +static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, + struct inode_defrag *defrag) +{ struct btrfs_root *inode_root; struct inode *inode; - struct rb_node *n; struct btrfs_key key; struct btrfs_ioctl_defrag_range_args range; - u64 first_ino = 0; - u64 root_objectid = 0; int num_defrag; - int defrag_batch = 1024; + /* get the inode */ + key.objectid = defrag->root; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.offset = (u64)-1; + inode_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(inode_root)) { + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + return PTR_ERR(inode_root); + } + + key.objectid = defrag->ino; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); + if (IS_ERR(inode)) { + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + return PTR_ERR(inode); + } + + /* do a chunk of defrag */ + clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); memset(&range, 0, sizeof(range)); range.len = (u64)-1; + range.start = defrag->last_offset; + num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, + BTRFS_DEFRAG_BATCH); + /* + * if we filled the whole defrag batch, there + * must be more work to do. 
Queue this defrag + * again + */ + if (num_defrag == BTRFS_DEFRAG_BATCH) { + defrag->last_offset = range.start; + btrfs_requeue_inode_defrag(inode, defrag); + } else if (defrag->last_offset && !defrag->cycled) { + /* + * we didn't fill our defrag batch, but + * we didn't start at zero. Make sure we loop + * around to the start of the file. + */ + defrag->last_offset = 0; + defrag->cycled = 1; + btrfs_requeue_inode_defrag(inode, defrag); + } else { + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + } + + iput(inode); + return 0; +} + +/* + * run through the list of inodes in the FS that need + * defragging + */ +int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) +{ + struct inode_defrag *defrag; + u64 first_ino = 0; + u64 root_objectid = 0; atomic_inc(&fs_info->defrag_running); - spin_lock(&fs_info->defrag_inodes_lock); while(1) { - n = NULL; + if (!__need_auto_defrag(fs_info->tree_root)) + break; /* find an inode to defrag */ - defrag = btrfs_find_defrag_inode(fs_info, root_objectid, - first_ino, &n); + defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, + first_ino); if (!defrag) { - if (n) { - defrag = rb_entry(n, struct inode_defrag, - rb_node); - } else if (root_objectid || first_ino) { + if (root_objectid || first_ino) { root_objectid = 0; first_ino = 0; continue; @@ -296,71 +373,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) } } - /* remove it from the rbtree */ first_ino = defrag->ino + 1; root_objectid = defrag->root; - rb_erase(&defrag->rb_node, &fs_info->defrag_inodes); - - if (btrfs_fs_closing(fs_info)) - goto next_free; - spin_unlock(&fs_info->defrag_inodes_lock); - - /* get the inode */ - key.objectid = defrag->root; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); - key.offset = (u64)-1; - inode_root = btrfs_read_fs_root_no_name(fs_info, &key); - if (IS_ERR(inode_root)) - goto next; - - key.objectid = defrag->ino; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); - key.offset = 0; - - inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); - if (IS_ERR(inode)) - goto next; - - /* do a chunk of defrag */ - clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); - range.start = defrag->last_offset; - num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, - defrag_batch); - /* - * if we filled the whole defrag batch, there - * must be more work to do. Queue this defrag - * again - */ - if (num_defrag == defrag_batch) { - defrag->last_offset = range.start; - btrfs_requeue_inode_defrag(inode, defrag); - /* - * we don't want to kfree defrag, we added it back to - * the rbtree - */ - defrag = NULL; - } else if (defrag->last_offset && !defrag->cycled) { - /* - * we didn't fill our defrag batch, but - * we didn't start at zero. Make sure we loop - * around to the start of the file. - */ - defrag->last_offset = 0; - defrag->cycled = 1; - btrfs_requeue_inode_defrag(inode, defrag); - defrag = NULL; - } - - iput(inode); -next: - spin_lock(&fs_info->defrag_inodes_lock); -next_free: - if (defrag) - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + __btrfs_run_defrag_inode(fs_info, defrag); } - spin_unlock(&fs_info->defrag_inodes_lock); - atomic_dec(&fs_info->defrag_running); /* -- cgit v1.2.1 From b66f00da0cfceb856c17706b77906b63437f6fda Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 09:27:29 +0000 Subject: Btrfs: fix freeze vs auto defrag If we freeze the fs, the auto defragment should not run. Fix it. 
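The rule the next patch applies: writes issued from a kernel thread bypass the VFS entry points that normally take freeze protection, so they must take it themselves. Sketched generically (the worker name is illustrative):

#include <linux/fs.h>

static void background_write_sketch(struct super_block *sb)
{
	/* Blocks here while the filesystem is frozen, like a syscall writer. */
	sb_start_write(sb);

	do_background_write(sb);	/* illustrative worker */

	sb_end_write(sb);
}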
Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/file.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 3c6f7479cd5b..d415a052ca9a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -318,8 +318,11 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, memset(&range, 0, sizeof(range)); range.len = (u64)-1; range.start = defrag->last_offset; + + sb_start_write(fs_info->sb); num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, BTRFS_DEFRAG_BATCH); + sb_end_write(fs_info->sb); /* * if we filled the whole defrag batch, there * must be more work to do. Queue this defrag -- cgit v1.2.1 From cb3806ec88a7e1e9d1fbde34cbc0bf153b7e7c3f Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 27 Nov 2012 16:10:21 +0000 Subject: Btrfs: fix race in check-integrity caused by usage of bitfield The structure member mirror_num is modified concurrently with the structure member is_iodone. This doesn't require any locking by design, unless everything is stored in the same 32 bits of a bit field. This was the case, and xfstest 284 was able to trigger false warnings from the checker code. This patch separates the bits and fixes the race. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/check-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index badc6f141b6f..11d47bfb62b4 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -137,7 +137,7 @@ struct btrfsic_block { unsigned int never_written:1; /* block was added because it was * referenced, not because it was * written */ - unsigned int mirror_num:2; /* large enough to hold + unsigned int mirror_num; /* large enough to hold * BTRFS_SUPER_MIRROR_MAX */ struct btrfsic_dev_state *dev_state; u64 dev_bytenr; /* key, physical byte num on disk */ -- cgit v1.2.1 From f9c83748deb94aba89b5c1085f887ddcdab223ef Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 27 Nov 2012 17:39:50 +0000 Subject: Btrfs: fix a build warning for an unused label This issue was detected by the "0-DAY kernel build testing". fs/btrfs/volumes.c: In function 'btrfs_rm_device': fs/btrfs/volumes.c:1505:1: warning: label 'error_close' defined but not used [-Wunused-label] Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 32a4948b621c..0b1b7e22e7ea 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1544,7 +1544,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) error_brelse: brelse(bh); -error_close: if (bdev) blkdev_put(bdev, FMODE_READ | FMODE_EXCL); out: -- cgit v1.2.1 From af1be4f851db4f4975f0139211a6561776ef37c0 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 27 Nov 2012 17:39:51 +0000 Subject: Btrfs: fix a scrub regression in case of write errors This regression was introduced by the device-replace patches. Scrub immediately stops checking those disks that have write errors. This is not something that happens in the real world, but it is wrong, since scrub is the tool to detect and repair defects. Fix it.
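The check-integrity fix above is an instance of a general C rule: adjacent bitfields packed into one word cannot be updated from different contexts without locking, since every store is a read-modify-write of the whole word. Illustrated with hypothetical struct layouts:

/*
 * Racy: both fields share one 32-bit unit, so concurrent writers
 * can silently overwrite each other's bits.
 */
struct racy {
	unsigned int is_iodone:1;
	unsigned int mirror_num:2;
};

/*
 * Safe: mirror_num now occupies its own word, so the two fields
 * can be written concurrently without a lock.
 */
struct safe {
	unsigned int is_iodone:1;
	unsigned int mirror_num;
};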
Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 30ba99724896..8db6a6413a5f 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2657,7 +2657,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, btrfs_put_block_group(cache); if (ret) break; - if (atomic64_read(&dev_replace->num_write_errors) > 0) { + if (is_dev_replace && + atomic64_read(&dev_replace->num_write_errors) > 0) { ret = -EIO; break; } -- cgit v1.2.1 From 797f4277113bff142b6c64a55abaef64d7d67d5c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 28 Nov 2012 10:28:07 +0000 Subject: Btrfs: use existing align macros in btrfs_allocate() The kernel developers have implemented some often-used align macros; we should use them instead of the complex open-coded arithmetic. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/file.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d415a052ca9a..a43d0aef6ee1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2054,12 +2054,12 @@ static long btrfs_fallocate(struct file *file, int mode, u64 alloc_end; u64 alloc_hint = 0; u64 locked_end; - u64 mask = BTRFS_I(inode)->root->sectorsize - 1; struct extent_map *em; + int blocksize = BTRFS_I(inode)->root->sectorsize; int ret; - alloc_start = offset & ~mask; - alloc_end = (offset + len + mask) & ~mask; + alloc_start = round_down(offset, blocksize); + alloc_end = round_up(offset + len, blocksize); /* Make sure we aren't being give some crap mode */ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) @@ -2140,7 +2140,7 @@ static long btrfs_fallocate(struct file *file, int mode, } last_byte = min(extent_map_end(em), alloc_end); actual_end = min_t(u64, extent_map_end(em), offset + len); - last_byte = (last_byte + mask) & ~mask; + last_byte = ALIGN(last_byte, blocksize); if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && -- cgit v1.2.1 From 0ff6fabdb0a862b22df4dd75873578392478e64d Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 28 Nov 2012 10:28:54 +0000 Subject: Btrfs: fix off-by-one error of the reserved size of btrfs_allocate() alloc_end is not the real end of the current extent, it is the start of the next adjoining extent. So we needn't +1 when calculating the size of the space that is about to be reserved. Signed-off-by: Miao Xie Reviewed-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a43d0aef6ee1..8e3d6788d6dd 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2072,7 +2072,7 @@ static long btrfs_fallocate(struct file *file, int mode, * Make sure we have enough space before we do the * allocation. */ - ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1); + ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); if (ret) return ret; @@ -2179,7 +2179,7 @@ static long btrfs_fallocate(struct file *file, int mode, out: mutex_unlock(&inode->i_mutex); /* Let go of our reservation. 
*/ - btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1); + btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); return ret; } -- cgit v1.2.1 From 755ac67f83e515af55adbfe55134eb7d90839cdb Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 28 Nov 2012 10:43:11 +0000 Subject: Btrfs: skip adding an acl attribute if we don't have to If the acl can be exactly represented in the traditional file mode permission bits, we don't set another acl attribute. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/acl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 0c16e3dbfd56..e15d2b0d8d3b 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -121,6 +121,8 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans, ret = posix_acl_equiv_mode(acl, &inode->i_mode); if (ret < 0) return ret; + if (ret == 0) + acl = NULL; } ret = 0; break; -- cgit v1.2.1 From 01e6deb25ae11e7b85484bf5e550eb540c50c63e Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 28 Nov 2012 10:43:12 +0000 Subject: Btrfs: don't add a NULL extended attribute Passing a null extended attribute value means to remove the attribute, but we don't have to add a new NULL extended attribute. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/xattr.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index e9d384055494..aef6bb3c5f5c 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -122,6 +122,16 @@ static int do_setxattr(struct btrfs_trans_handle *trans, */ if (!value) goto out; + } else { + di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), + name, name_len, 0); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + if (!di && !value) + goto out; + btrfs_release_path(path); } again: -- cgit v1.2.1 From 05dadc09f52ad5a631da1aa8767c5b80e121f0c4 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Thu, 29 Nov 2012 05:08:26 +0000 Subject: Btrfs: add fiemap's flag check When the flag not supported is specified, it is necessary to return the error to the caller. So, we add the validity check of the fiemap's flag. Signed-off-by: Tsutomu Itoh Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d7bf2e7ee8a0..a1761f01cf11 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6595,9 +6595,17 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, btrfs_submit_direct, 0); } +#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) + static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { + int ret; + + ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS); + if (ret) + return ret; + return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); } -- cgit v1.2.1 From 2794ed013b3551cbae887ea1b93c52aaacb7370d Mon Sep 17 00:00:00 2001 From: Filipe Brandenburger Date: Fri, 30 Nov 2012 03:40:08 +0000 Subject: Btrfs: fix permissions of empty files not affected by umask When a new file is created with btrfs_create(), the inode will initially be created with permissions 0666 and later on in btrfs_init_acl() it will be adapted to mask out the umask bits. The problem is that this change won't make it into the btrfs_inode unless there's another change to the inode (e.g. writing content changing the size or touching the file changing the mtime.) 
This fix adds a call to btrfs_update_inode() to btrfs_create() to make sure that the change will not get lost if the in-memory inode is flushed before other changes are made to the file. Signed-off-by: Filipe Brandenburger Reviewed-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a1761f01cf11..adab791e1ce9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4951,6 +4951,12 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, goto out_unlock; } + err = btrfs_update_inode(trans, root, inode); + if (err) { + drop_inode = 1; + goto out_unlock; + } + /* * If the active LSM wants to access the inode during * d_instantiate it needs these. Smack checks to see -- cgit v1.2.1 From 43baa579b3b1f059f68c51ef754ec59c87a35745 Mon Sep 17 00:00:00 2001 From: Filipe Brandenburger Date: Fri, 30 Nov 2012 03:40:09 +0000 Subject: Btrfs: refactor error handling to drop inode in btrfs_create() Refactor it by checking whether the inode has been created and needs to be dropped (drop_inode_on_err) and also if the err variable is set. That way the variable doesn't need to be set on each and every error handling block. Signed-off-by: Filipe Brandenburger Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index adab791e1ce9..657f16d9c78b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4989,7 +4989,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = NULL; - int drop_inode = 0; + int drop_inode_on_err = 0; int err; u64 objectid; u64 index = 0; @@ -5014,12 +5014,11 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, err = PTR_ERR(inode); goto out_unlock; } + drop_inode_on_err = 1; err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) { - drop_inode = 1; + if (err) goto out_unlock; - } /* * If the active LSM wants to access the inode during @@ -5032,16 +5031,16 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) - drop_inode = 1; - else { - inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; - d_instantiate(dentry, inode); - } + goto out_unlock; + + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + d_instantiate(dentry, inode); + out_unlock: btrfs_end_transaction(trans, root); - if (drop_inode) { + if (err && drop_inode_on_err) { inode_dec_link_count(inode); iput(inode); } -- cgit v1.2.1 From 960097622d48bf0ee8f6c0cf751a904066c4b45b Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Fri, 30 Nov 2012 06:30:14 +0000 Subject: Btrfs: use ctl->unit for free space calculation instead of block_group->sectorsize We should use ctl->unit for free space calculation instead of block_group->sectorsize, even though for the free space bitmap and the free space cluster we only have sectorsize assigned to ctl->unit currently. Also, it keeps the code style consistent.
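The cleanup below matters because every conversion between bytes and bitmap bits must agree on one unit; with the helpers parameterized on it, a later change to ctl->unit cannot desynchronize the math. A sketch of the arithmetic, with illustrative names mirroring the static helpers in free-space-cache.c:

/* Convert a byte offset inside a bitmap into a bit index, one unit per bit. */
static inline u64 offset_to_bit_sketch(u64 bitmap_start, u32 unit, u64 offset)
{
	return (offset - bitmap_start) / unit;
}

/* Convert a byte count into the number of bits needed, rounding up. */
static inline u64 bytes_to_bits_sketch(u64 bytes, u32 unit)
{
	return DIV_ROUND_UP(bytes, unit);
}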
Signed-off-by: Wang Sheng-Hui Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index efdd1d3f441c..59ea2e4349c9 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1353,7 +1353,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) u64 bitmap_bytes; u64 extent_bytes; u64 size = block_group->key.offset; - u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize; + u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); BUG_ON(ctl->total_bitmaps > max_bitmaps); @@ -1639,8 +1639,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, * some block groups are so tiny they can't be enveloped by a bitmap, so * don't even bother to create a bitmap for this */ - if (BITS_PER_BITMAP * block_group->sectorsize > - block_group->key.offset) + if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset) return false; return true; @@ -2287,10 +2286,10 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, unsigned long total_found = 0; int ret; - i = offset_to_bit(entry->offset, block_group->sectorsize, + i = offset_to_bit(entry->offset, ctl->unit, max_t(u64, offset, entry->offset)); - want_bits = bytes_to_bits(bytes, block_group->sectorsize); - min_bits = bytes_to_bits(min_bytes, block_group->sectorsize); + want_bits = bytes_to_bits(bytes, ctl->unit); + min_bits = bytes_to_bits(min_bytes, ctl->unit); again: found_bits = 0; @@ -2314,23 +2313,22 @@ again: total_found += found_bits; - if (cluster->max_size < found_bits * block_group->sectorsize) - cluster->max_size = found_bits * block_group->sectorsize; + if (cluster->max_size < found_bits * ctl->unit) + cluster->max_size = found_bits * ctl->unit; if (total_found < want_bits || cluster->max_size < cont1_bytes) { i = next_zero + 1; goto again; } - cluster->window_start = start * block_group->sectorsize + - entry->offset; + cluster->window_start = start * ctl->unit + entry->offset; rb_erase(&entry->offset_index, &ctl->free_space_offset); ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 1); BUG_ON(ret); /* -EEXIST; Logic error */ trace_btrfs_setup_cluster(block_group, cluster, - total_found * block_group->sectorsize, 1); + total_found * ctl->unit, 1); return 0; } -- cgit v1.2.1 From 543eabd5e1929bc73e22b279aa911eb01447535f Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 5 Dec 2012 10:52:48 +0000 Subject: Btrfs: don't auto defrag a file when doing directIO If we run direct IO, we should not run auto defrag, because it may introduce a buffered IO vs. direct IO problem and slow direct IO down.
Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 657f16d9c78b..bf609581c5d0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5692,9 +5692,6 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, if (IS_ERR(trans)) return ERR_CAST(trans); - if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024) - btrfs_add_inode_defrag(trans, inode); - trans->block_rsv = &root->fs_info->delalloc_block_rsv; alloc_hint = get_extent_allocation_hint(inode, start, len); -- cgit v1.2.1 From 4b5829a8e3104c8bc115d926a0285d3ff9bcfc77 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 5 Dec 2012 10:53:25 +0000 Subject: Btrfs: fix missing reserved space release in error path of delalloc reservation We forgot to release the reserved space in the error path of the delalloc reservation. Fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 98af8379895a..e15280989188 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4559,6 +4559,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) ret = btrfs_qgroup_reserve(root, num_bytes + nr_extents * root->leafsize); if (ret) { + spin_lock(&BTRFS_I(inode)->lock); + calc_csum_metadata_size(inode, num_bytes, 0); + spin_unlock(&BTRFS_I(inode)->lock); mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); return ret; } @@ -4594,6 +4597,10 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) btrfs_ino(inode), to_free, 0); } + if (root->fs_info->quota_enabled) { + btrfs_qgroup_free(root, num_bytes + + nr_extents * root->leafsize); + } mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); return ret; } -- cgit v1.2.1 From 6347b3c433a4cff00eb2299c7f2c7d1d8b24c1fc Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 5 Dec 2012 10:53:45 +0000 Subject: Btrfs: fix off-by-one error of the same page check in btrfs_punch_hole() (start + len) is the start of the adjacent extent, not the end of the current extent, so we should not use it to check whether the hole is on the same page or not. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8e3d6788d6dd..d75412bf7c4a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1867,8 +1867,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) u64 drop_end; int ret = 0; int err = 0; - bool same_page = (offset >> PAGE_CACHE_SHIFT) == - ((offset + len) >> PAGE_CACHE_SHIFT); + bool same_page = ((offset >> PAGE_CACHE_SHIFT) == + ((offset + len - 1) >> PAGE_CACHE_SHIFT)); btrfs_wait_ordered_range(inode, offset, len); -- cgit v1.2.1 From 0061280d2c7240805cfd7b1f493da967c97c2f34 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 5 Dec 2012 10:54:12 +0000 Subject: Btrfs: fix the page that is beyond EOF Steps to reproduce: # mkfs.btrfs # mount # dd if=/dev/zero of=/ bs=512 seek=5 count=8 # fallocate -p -o 2048 -l 16384 / # dd if=/dev/zero of=/ bs=4096 seek=3 count=8 conv=notrunc,nocreat # umount # dmesg WARNING: at fs/btrfs/inode.c:7140 btrfs_destroy_inode+0x2eb/0x330 The reason is that we passed in a range which is beyond the end of the file.
And because the end of this range was not page-aligned, we had to truncate the last page in this range; this operation is similar to a buffered file write. In other words, we reserved enough space and cleared the data which was in the hole range on that page. But when we expanded that test file and wrote data into the same page, we forgot that we had already reserved enough space for the buffered write of that page, because in most cases there is no page that is beyond the end of the file. As a result, we reserved the space twice. In fact, we needn't truncate the page if it is beyond the end of the file; just release the allocated space in that range. Fix the above problem in this way. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/file.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d75412bf7c4a..700ffd266da3 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1859,9 +1859,9 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) struct btrfs_path *path; struct btrfs_block_rsv *rsv; struct btrfs_trans_handle *trans; - u64 mask = BTRFS_I(inode)->root->sectorsize - 1; - u64 lockstart = (offset + mask) & ~mask; - u64 lockend = ((offset + len) & ~mask) - 1; + u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); + u64 lockend = round_down(offset + len, + BTRFS_I(inode)->root->sectorsize) - 1; u64 cur_offset = lockstart; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); u64 drop_end; @@ -1896,10 +1896,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } /* zero the front end of the last page */ - ret = btrfs_truncate_page(inode, offset + len, 0, 1); - if (ret) { - mutex_unlock(&inode->i_mutex); - return ret; + if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) { + ret = btrfs_truncate_page(inode, offset + len, 0, 1); + if (ret) { + mutex_unlock(&inode->i_mutex); + return ret; + } } if (lockend < lockstart) { -- cgit v1.2.1 From 7426cc04d407621773af3a0403e57642e40c36bf Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 5 Dec 2012 10:54:52 +0000 Subject: Btrfs: punch hole past the end of the file Since we can pre-allocate space past EOF, we should be able to reclaim that space if we need to. This patch implements it by removing the EOF check. The manual of the fallocate command says we can use the truncate command to reclaim the pre-allocated space which is past EOF, but because the truncate command changes the file size, we would have to run several commands to reclaim the space if we don't want to change the file size, so it is not a good choice.
*/ if (same_page && len < PAGE_CACHE_SIZE) { - ret = btrfs_truncate_page(inode, offset, len, 0); + if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) + ret = btrfs_truncate_page(inode, offset, len, 0); mutex_unlock(&inode->i_mutex); return ret; } /* zero back part of the first page */ - ret = btrfs_truncate_page(inode, offset, 0, 0); - if (ret) { - mutex_unlock(&inode->i_mutex); - return ret; + if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) { + ret = btrfs_truncate_page(inode, offset, 0, 0); + if (ret) { + mutex_unlock(&inode->i_mutex); + return ret; + } } /* zero the front end of the last page */ -- cgit v1.2.1 From ac6a2b36f9fcfbe4865550afb6d333dec6b57578 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 5 Dec 2012 10:56:13 +0000 Subject: Btrfs: fix wrong return value of btrfs_truncate_page() The ret variable may be set to 0 if we read the page successfully, but the page might be released before we lock it again. In this case, if we fail to allocate a new page, we would return 0, which is wrong. Fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bf609581c5d0..0446cbe8bcaf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3521,11 +3521,11 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, if (ret) goto out; - ret = -ENOMEM; again: page = find_or_create_page(mapping, index, mask); if (!page) { btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); + ret = -ENOMEM; goto out; } @@ -3574,7 +3574,6 @@ again: goto out_unlock; } - ret = 0; if (offset != PAGE_CACHE_SIZE) { if (!len) len = PAGE_CACHE_SIZE - offset; -- cgit v1.2.1 From b8b8ff590f99678616f9ea85f5088542d1cfc0be Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Thu, 6 Dec 2012 19:25:48 +0000 Subject: btrfs: Notify udev when removing device Currently udev does not know about the device being removed from the file system. This may result in the situation where we're unable to mount the file system by UUID or by LABEL because the by-uuid and by-label links may still point to the device which is no longer part of the btrfs file system and hence does not have any btrfs super block. It can be easily reproduced by the following: mkfs.btrfs -L bugfs /dev/loop[0-6] mount /dev/loop0 /mnt/test btrfs device delete /dev/loop0 /mnt/test umount /mnt/test mount LABEL=bugfs /mnt/test <---- this fails then see: ls -l /dev/disk/by-label/bugfs which will still point to the /dev/loop0 We did not notice this before because libblkid would send the udev event for us when it noticed that the link did not fit reality; however, it does not do that anymore and completely relies on udev information. Fix this by sending the KOBJ_CHANGE event to the bdev kobject after successful device removal. Note that this does not affect device addition, because we open the device prior to the addition from userspace and udev will notice that and reread the device afterwards.
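The fix below distills to a one-line notification; the only subtlety is reaching the struct device embedded in the gendisk. A sketch (the helper name is illustrative):

#include <linux/genhd.h>
#include <linux/kobject.h>

/* Ask udev to re-probe the device so stale by-uuid/by-label links drop. */
static void notify_device_changed(struct block_device *bdev)
{
	kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
}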
Signed-off-by: Lukas Czerner Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0b1b7e22e7ea..886f4ba0f71d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -72,6 +72,19 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) kfree(fs_devices); } +static void btrfs_kobject_uevent(struct block_device *bdev, + enum kobject_action action) +{ + int ret; + + ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); + if (ret) + pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n", + action, + kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), + &disk_to_dev(bdev->bd_disk)->kobj); +} + void btrfs_cleanup_fs_uuids(void) { struct btrfs_fs_devices *fs_devices; @@ -1542,6 +1555,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) ret = 0; + /* Notify udev that device has changed */ + btrfs_kobject_uevent(bdev, KOBJ_CHANGE); + error_brelse: brelse(bh); if (bdev) -- cgit v1.2.1 From 5f3ab90a72f98adbf00c50ac2d4d2b47cf4a9685 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 7 Dec 2012 09:28:54 +0000 Subject: Btrfs: rename root_times_lock to root_item_lock Originally root_times_lock was introduced as part of send/receive code however newly developed patch to label the subvol reused the same lock, so renaming it for a meaningful name. Signed-off-by: Anand Jain Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 16 ++++++++-------- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/root-tree.c | 4 ++-- fs/btrfs/send.c | 8 ++++---- 5 files changed, 16 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 5c2cf992e717..01efcbc80dfb 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5114,13 +5114,13 @@ int btrfs_compare_trees(struct btrfs_root *left_root, right_path->search_commit_root = 1; right_path->skip_locking = 1; - spin_lock(&left_root->root_times_lock); + spin_lock(&left_root->root_item_lock); left_start_ctransid = btrfs_root_ctransid(&left_root->root_item); - spin_unlock(&left_root->root_times_lock); + spin_unlock(&left_root->root_item_lock); - spin_lock(&right_root->root_times_lock); + spin_lock(&right_root->root_item_lock); right_start_ctransid = btrfs_root_ctransid(&right_root->root_item); - spin_unlock(&right_root->root_times_lock); + spin_unlock(&right_root->root_item_lock); trans = btrfs_join_transaction(left_root); if (IS_ERR(trans)) { @@ -5215,15 +5215,15 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; } - spin_lock(&left_root->root_times_lock); + spin_lock(&left_root->root_item_lock); ctransid = btrfs_root_ctransid(&left_root->root_item); - spin_unlock(&left_root->root_times_lock); + spin_unlock(&left_root->root_item_lock); if (ctransid != left_start_ctransid) left_start_ctransid = 0; - spin_lock(&right_root->root_times_lock); + spin_lock(&right_root->root_item_lock); ctransid = btrfs_root_ctransid(&right_root->root_item); - spin_unlock(&right_root->root_times_lock); + spin_unlock(&right_root->root_item_lock); if (ctransid != right_start_ctransid) right_start_ctransid = 0; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6ba56aea5b62..313a6adfde55 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1645,7 +1645,7 @@ struct btrfs_root { int force_cow; - spinlock_t root_times_lock; + spinlock_t root_item_lock; }; struct btrfs_ioctl_defrag_range_args { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 
3229531af8c3..faf182691b40 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1204,7 +1204,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->root_key.objectid = objectid; root->anon_dev = 0; - spin_lock_init(&root->root_times_lock); + spin_lock_init(&root->root_item_lock); } static int __must_check find_and_setup_root(struct btrfs_root *tree_root, diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index eb923d087da7..668af537a3ea 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -548,9 +548,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct btrfs_root_item *item = &root->root_item; struct timespec ct = CURRENT_TIME; - spin_lock(&root->root_times_lock); + spin_lock(&root->root_item_lock); item->ctransid = cpu_to_le64(trans->transid); item->ctime.sec = cpu_to_le64(ct.tv_sec); item->ctime.nsec = cpu_to_le32(ct.tv_nsec); - spin_unlock(&root->root_times_lock); + spin_unlock(&root->root_item_lock); } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index e78b297b0b00..54454542ad40 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4397,9 +4397,9 @@ static int full_send_tree(struct send_ctx *sctx) if (!path) return -ENOMEM; - spin_lock(&send_root->root_times_lock); + spin_lock(&send_root->root_item_lock); start_ctransid = btrfs_root_ctransid(&send_root->root_item); - spin_unlock(&send_root->root_times_lock); + spin_unlock(&send_root->root_item_lock); key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.type = BTRFS_INODE_ITEM_KEY; @@ -4422,9 +4422,9 @@ join_trans: * Make sure the tree has not changed after re-joining. We detect this * by comparing start_ctransid and ctransid. They should always match. */ - spin_lock(&send_root->root_times_lock); + spin_lock(&send_root->root_item_lock); ctransid = btrfs_root_ctransid(&send_root->root_item); - spin_unlock(&send_root->root_times_lock); + spin_unlock(&send_root->root_item_lock); if (ctransid != start_ctransid) { WARN(1, KERN_WARNING "btrfs: the root that you're trying to " -- cgit v1.2.1 From e99761514999f64aff1985460967f93d9e8417f4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 11 Oct 2012 15:53:56 -0400 Subject: Btrfs: only log the inode item if we can get away with it Currently we copy all of the file information into the log: the inode item, the refs, xattrs, etc. But most of this doesn't change from fsync to fsync; only the inode item changes. So set a flag if an xattr changes or a link is added, and otherwise only log the inode item.
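In outline, condensed from the hunks below, the mechanism is one runtime flag: metadata writers set it, and the fsync path consumes it atomically to decide how far up the key space to drop and re-copy:

	/* writer side (btrfs_link(), __btrfs_setxattr()): */
	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);

	/* fsync side (btrfs_log_inode()): */
	if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
			       &BTRFS_I(inode)->runtime_flags))
		max_key.type = BTRFS_XATTR_ITEM_KEY;	/* refs, xattrs, ... */
	else
		max_key.type = BTRFS_INODE_ITEM_KEY;	/* inode item only */
	ret = drop_objectid_items(trans, log, path, ino, max_key.type);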
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/btrfs_inode.h | 1 + fs/btrfs/inode.c | 1 + fs/btrfs/tree-log.c | 10 ++++++++-- fs/btrfs/xattr.c | 1 + 4 files changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ed8ca7ca5eff..2411baf35220 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -39,6 +39,7 @@ #define BTRFS_INODE_HAS_ORPHAN_ITEM 5 #define BTRFS_INODE_HAS_ASYNC_EXTENT 6 #define BTRFS_INODE_NEEDS_FULL_SYNC 7 +#define BTRFS_INODE_COPY_EVERYTHING 8 /* in memory btrfs inode */ struct btrfs_inode { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0446cbe8bcaf..123815f3b454 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5083,6 +5083,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, inode_inc_iversion(inode); inode->i_ctime = CURRENT_TIME; ihold(inode); + set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 40b9efd20e43..f05fca778cb4 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3429,14 +3429,20 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } else { if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags)) { + clear_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags); ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); } else { if (inode_only == LOG_INODE_ALL) fast_search = true; - max_key.type = BTRFS_XATTR_ITEM_KEY; + if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags)) + max_key.type = BTRFS_XATTR_ITEM_KEY; + else + max_key.type = BTRFS_INODE_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, - BTRFS_XATTR_ITEM_KEY); + max_key.type); } } if (ret) { diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index aef6bb3c5f5c..446a6848c554 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -208,6 +208,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans, inode_inc_iversion(inode); inode->i_ctime = CURRENT_TIME; + set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); out: -- cgit v1.2.1 From a95249b392c3ab843d7b25ab6817ecc9ea0b82ee Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 11 Oct 2012 16:17:34 -0400 Subject: Btrfs: don't bother copying if we're only logging the inode We don't copy inode items anyway; we just copy them straight into the log from the in-memory inode. So if we know we're only logging the inode, don't bother dropping anything: just try to insert it, and whether it succeeds or we get EEXIST, we can update the inode item in the log and carry on.
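The insert-or-overwrite idiom, condensed from the log_inode_item() hunk below; -EEXIST is deliberately tolerated, since an item already present in the log can simply be rewritten in place:

	ret = btrfs_insert_empty_item(trans, log, path, &key,
				      sizeof(struct btrfs_inode_item));
	if (ret && ret != -EEXIST)	/* only a real failure aborts */
		return ret;
	/* new or pre-existing, the slot is valid either way */
	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				    struct btrfs_inode_item);
	fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
	btrfs_release_path(path);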
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f05fca778cb4..ab7168ee618f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2996,6 +2996,26 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, } +static int log_inode_item(struct btrfs_trans_handle *trans, + struct btrfs_root *log, struct btrfs_path *path, + struct inode *inode) +{ + struct btrfs_inode_item *inode_item; + struct btrfs_key key; + int ret; + + memcpy(&key, &BTRFS_I(inode)->location, sizeof(key)); + ret = btrfs_insert_empty_item(trans, log, path, &key, + sizeof(*inode_item)); + if (ret && ret != -EEXIST) + return ret; + inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + fill_inode_item(trans, path->nodes[0], inode_item, inode, 0); + btrfs_release_path(path); + return 0; +} + static noinline int copy_items(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *dst_path, @@ -3433,17 +3453,24 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, &BTRFS_I(inode)->runtime_flags); ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); - } else { + } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags)) { if (inode_only == LOG_INODE_ALL) fast_search = true; - if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, - &BTRFS_I(inode)->runtime_flags)) - max_key.type = BTRFS_XATTR_ITEM_KEY; - else - max_key.type = BTRFS_INODE_ITEM_KEY; + max_key.type = BTRFS_XATTR_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, max_key.type); + } else { + if (inode_only == LOG_INODE_ALL) + fast_search = true; + ret = log_inode_item(trans, log, dst_path, inode); + if (ret) { + err = ret; + goto out_unlock; + } + goto log_extents; } + } if (ret) { err = ret; @@ -3522,6 +3549,7 @@ next_slot: ins_nr = 0; } +log_extents: if (fast_search) { btrfs_release_path(path); btrfs_release_path(dst_path); -- cgit v1.2.1 From b812ce28796f746f14ba6cc451250c422db447b2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Nov 2012 13:56:32 -0500 Subject: Btrfs: inline csums if we're fsyncing The tree logging stuff needs the csums to be on the ordered extents in order to log them properly, so mark that we're sync and inline the csum creation so we don't have to wait on the csumming to be done when logging extents that are still in flight. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/btrfs_inode.h | 3 +++ fs/btrfs/file.c | 8 ++++++++ fs/btrfs/inode.c | 11 ++++++++++- 3 files changed, 21 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 2411baf35220..2a8c242bc4f5 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -91,6 +91,9 @@ struct btrfs_inode { unsigned long runtime_flags; + /* Keep track of who's O_SYNC/fsycing currently */ + atomic_t sync_writers; + /* full 64 bit generation number, struct vfs_inode doesn't have a big * enough field for this. 
*/ diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 71c2dc1ea15a..7f4654a15207 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1472,6 +1472,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, ssize_t num_written = 0; ssize_t err = 0; size_t count, ocount; + bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); sb_start_write(inode->i_sb); @@ -1529,6 +1530,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, } } + if (sync) + atomic_inc(&BTRFS_I(inode)->sync_writers); + if (unlikely(file->f_flags & O_DIRECT)) { num_written = __btrfs_direct_write(iocb, iov, nr_segs, pos, ppos, count, ocount); @@ -1563,6 +1567,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, num_written = err; } out: + if (sync) + atomic_dec(&BTRFS_I(inode)->sync_writers); sb_end_write(inode->i_sb); current->backing_dev_info = NULL; return num_written ? num_written : err; @@ -1613,7 +1619,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * out of the ->i_mutex. If so, we can flush the dirty pages by * multi-task, and make the performance up. */ + atomic_inc(&BTRFS_I(inode)->sync_writers); ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + atomic_dec(&BTRFS_I(inode)->sync_writers); if (ret) return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 123815f3b454..7855aac36706 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1622,6 +1622,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, int ret = 0; int skip_sum; int metadata = 0; + int async = !atomic_read(&BTRFS_I(inode)->sync_writers); skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; @@ -1644,7 +1645,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, goto out; } goto mapit; - } else if (!skip_sum) { + } else if (async && !skip_sum) { /* csum items have already been cloned */ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) goto mapit; @@ -1655,6 +1656,10 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, __btrfs_submit_bio_start, __btrfs_submit_bio_done); goto out; + } else if (!skip_sum) { + ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); + if (ret) + goto out; } mapit: @@ -6333,6 +6338,9 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, struct btrfs_root *root = BTRFS_I(inode)->root; int ret; + if (async_submit) + async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); + bio_get(bio); if (!write) { @@ -7113,6 +7121,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); ei->io_tree.track_uptodate = 1; ei->io_failure_tree.track_uptodate = 1; + atomic_set(&ei->sync_writers, 0); mutex_init(&ei->log_mutex); mutex_init(&ei->delalloc_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); -- cgit v1.2.1 From b493968096944a11422c4d80fb87af537ca1cac7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 3 Dec 2012 10:31:19 -0500 Subject: Btrfs: keep track of the extents original block length If we've written to a prealloc extent we need to know the original block len for the extent. We can't figure this out currently since ->block_len is just set to the extent length. So introduce ->orig_block_len so that we know how many bytes were in the original extent for proper extent logging that future patches will need. 
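Concretely, condensed from the hunks below: a freshly allocated extent starts with the two lengths equal, and each split preserves the larger original length so logging can later reconstruct disk_num_bytes:

	/* at allocation time the extent is whole: */
	em->block_len = ins.offset;
	em->orig_block_len = ins.offset;

	/* when btrfs_drop_extent_cache() splits the em, each half keeps
	 * the original extent's length: */
	split->orig_block_len = max(split->block_len, em->orig_block_len);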
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent_map.h | 1 + fs/btrfs/file.c | 5 +++++ fs/btrfs/inode.c | 22 ++++++++++++++++++---- 3 files changed, 24 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 679225555f7b..99a0dcb5ba2f 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -24,6 +24,7 @@ struct extent_map { u64 mod_start; u64 mod_len; u64 orig_start; + u64 orig_block_len; u64 block_start; u64 block_len; u64 generation; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7f4654a15207..6810145f4e97 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -588,6 +588,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->block_len = em->block_len; else split->block_len = split->len; + split->orig_block_len = max(split->block_len, + em->orig_block_len); split->generation = gen; split->bdev = em->bdev; split->flags = flags; @@ -609,6 +611,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->flags = flags; split->compress_type = em->compress_type; split->generation = gen; + split->orig_block_len = max(em->block_len, + em->orig_block_len); if (compressed) { split->block_len = em->block_len; @@ -1838,6 +1842,7 @@ out: hole_em->block_start = EXTENT_MAP_HOLE; hole_em->block_len = 0; + hole_em->orig_block_len = 0; hole_em->bdev = root->fs_info->fs_devices->latest_bdev; hole_em->compress_type = BTRFS_COMPRESS_NONE; hole_em->generation = trans->transid; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7855aac36706..bfd59bcc50d7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -699,6 +699,7 @@ retry: em->block_start = ins.objectid; em->block_len = ins.offset; + em->orig_block_len = ins.offset; em->bdev = root->fs_info->fs_devices->latest_bdev; em->compress_type = async_extent->compress_type; set_bit(EXTENT_FLAG_PINNED, &em->flags); @@ -886,6 +887,7 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, em->block_start = ins.objectid; em->block_len = ins.offset; + em->orig_block_len = ins.offset; em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); @@ -1143,6 +1145,7 @@ static noinline int run_delalloc_nocow(struct inode *inode, u64 extent_offset; u64 disk_bytenr; u64 num_bytes; + u64 disk_num_bytes; int extent_type; int ret, err; int type; @@ -1245,6 +1248,8 @@ next_slot: extent_offset = btrfs_file_extent_offset(leaf, fi); extent_end = found_key.offset + btrfs_file_extent_num_bytes(leaf, fi); + disk_num_bytes = + btrfs_file_extent_disk_num_bytes(leaf, fi); if (extent_end <= start) { path->slots[0]++; goto next_slot; @@ -1319,6 +1324,7 @@ out_check: em->len = num_bytes; em->block_len = num_bytes; em->block_start = disk_bytenr; + em->orig_block_len = disk_num_bytes; em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); set_bit(EXTENT_FLAG_PREALLOC, &em->flags); @@ -3696,6 +3702,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) hole_em->block_start = EXTENT_MAP_HOLE; hole_em->block_len = 0; + hole_em->orig_block_len = 0; hole_em->bdev = root->fs_info->fs_devices->latest_bdev; hole_em->compress_type = BTRFS_COMPRESS_NONE; hole_em->generation = trans->transid; @@ -5374,6 +5381,8 @@ again: em->len = extent_end - extent_start; em->orig_start = extent_start - btrfs_file_extent_offset(leaf, item); + em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, + item); bytenr = btrfs_file_extent_disk_bytenr(leaf, 
item); if (bytenr == 0) { em->block_start = EXTENT_MAP_HOLE; goto insert; } if (compress_type != BTRFS_COMPRESS_NONE) { set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); em->compress_type = compress_type; em->block_start = bytenr; - em->block_len = btrfs_file_extent_disk_num_bytes(leaf, - item); + em->block_len = em->orig_block_len; } else { bytenr += btrfs_file_extent_offset(leaf, item); em->block_start = bytenr; @@ -5414,6 +5422,7 @@ again: em->start = extent_start + extent_offset; em->len = (copy_size + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + em->orig_block_len = em->len; em->orig_start = EXTENT_MAP_INLINE; if (compress_type) { set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); @@ -5721,6 +5730,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, em->block_start = ins.objectid; em->block_len = ins.offset; + em->orig_block_len = ins.offset; em->bdev = root->fs_info->fs_devices->latest_bdev; /* @@ -5914,7 +5924,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, static struct extent_map *create_pinned_em(struct inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, - int type) + u64 orig_block_len, int type) { struct extent_map_tree *em_tree; struct extent_map *em; @@ -5932,6 +5942,7 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, em->block_len = block_len; em->block_start = block_start; em->bdev = root->fs_info->fs_devices->latest_bdev; + em->orig_block_len = orig_block_len; set_bit(EXTENT_FLAG_PINNED, &em->flags); if (type == BTRFS_ORDERED_PREALLOC) set_bit(EXTENT_FLAG_PREALLOC, &em->flags); @@ -6068,12 +6079,14 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (can_nocow_odirect(trans, inode, start, len) == 1) { u64 orig_start = em->start; + u64 orig_block_len = em->orig_block_len; if (type == BTRFS_ORDERED_PREALLOC) { free_extent_map(em); em = create_pinned_em(inode, start, len, orig_start, - block_start, len, type); + block_start, len, + orig_block_len, type); if (IS_ERR(em)) { btrfs_end_transaction(trans, root); goto unlock_err; @@ -7771,6 +7784,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, em->len = ins.offset; em->block_start = ins.objectid; em->block_len = ins.offset; + em->orig_block_len = ins.offset; em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PREALLOC, &em->flags); em->generation = trans->transid; -- cgit v1.2.1 From b11e234d21e73df94099e473a080bca502b9a496 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 3 Dec 2012 10:58:15 -0500 Subject: Btrfs: do not mark ems as prealloc if we are writing to them We are going to use EMs to log extents in the future, so we need to not mark them as prealloc if they aren't actually prealloc extents. Instead, mark them with FILLING so we know to amend mod_start/mod_len, and that way we don't confuse the extent logging code.
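Schematically, condensed from the hunks below: the writer pins the em with FILLING instead of PREALLOC, and unpin_extent_cache() consumes FILLING when fixing up mod_start/mod_len, leaving PREALLOC to mean a genuinely preallocated extent:

	/* writer filling a preallocated region: */
	set_bit(EXTENT_FLAG_PINNED, &em->flags);
	set_bit(EXTENT_FLAG_FILLING, &em->flags);

	/* unpin_extent_cache() consumes the flag: */
	if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
		prealloc = true;
		clear_bit(EXTENT_FLAG_FILLING, &em->flags);
	}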
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 4 ++-- fs/btrfs/extent_map.h | 1 + fs/btrfs/inode.c | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index b8cbc8d5c7f7..85ae2b6fe03b 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -266,9 +266,9 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, em->mod_start = em->start; em->mod_len = em->len; - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) { prealloc = true; - clear_bit(EXTENT_FLAG_PREALLOC, &em->flags); + clear_bit(EXTENT_FLAG_FILLING, &em->flags); } try_merge_map(tree, em); diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 99a0dcb5ba2f..922943ce29e8 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -14,6 +14,7 @@ #define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ #define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ +#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */ struct extent_map { struct rb_node rb_node; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bfd59bcc50d7..73e6833dcc21 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1327,7 +1327,7 @@ out_check: em->orig_block_len = disk_num_bytes; em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); - set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + set_bit(EXTENT_FLAG_FILLING, &em->flags); while (1) { write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); @@ -5945,7 +5945,7 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, em->orig_block_len = orig_block_len; set_bit(EXTENT_FLAG_PINNED, &em->flags); if (type == BTRFS_ORDERED_PREALLOC) - set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + set_bit(EXTENT_FLAG_FILLING, &em->flags); do { btrfs_drop_extent_cache(inode, em->start, -- cgit v1.2.1 From d6393786cd40f67709324bc4f08d7e4b911153fe Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 12 Dec 2012 17:00:01 -0500 Subject: Btrfs: add path->really_keep_locks You'd think path->keep_locks would keep all the locks wouldn't you? You'd be wrong. It only keeps them if the slot is pointing to the last item in the node. This is for use with btrfs_next_leaf, which needs this sort of thing. But the horrible horrible things I'm going to do to the tree log means I really need everything held from root to leaf so I can add and delete items in the same search. 
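An illustrative use, not taken verbatim from this patch (though log_one_extent() in a later patch of this series follows exactly this shape): set the flag before the search so every node from root to leaf stays locked, and clear it when done:

	path->really_keep_locks = 1;
	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi));
	if (ret && ret != -EEXIST) {
		path->really_keep_locks = 0;
		return ret;
	}
	/* ... add and delete items under one consistent set of locks ... */
	btrfs_release_path(path);
	path->really_keep_locks = 0;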
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 7 +++++-- fs/btrfs/ctree.h | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 01efcbc80dfb..0c5c28ff794f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2212,6 +2212,9 @@ static noinline void unlock_up(struct btrfs_path *path, int level, int no_skips = 0; struct extent_buffer *t; + if (path->really_keep_locks) + return; + for (i = level; i < BTRFS_MAX_LEVEL; i++) { if (!path->nodes[i]) break; @@ -2259,7 +2262,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) { int i; - if (path->keep_locks) + if (path->keep_locks || path->really_keep_locks) return; for (i = level; i < BTRFS_MAX_LEVEL; i++) { @@ -2492,7 +2495,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root if (!cow) write_lock_level = -1; - if (cow && (p->keep_locks || p->lowest_level)) + if (cow && (p->really_keep_locks || p->keep_locks || p->lowest_level)) write_lock_level = BTRFS_MAX_LEVEL; min_write_lock_level = write_lock_level; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 313a6adfde55..9ed452f5d062 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -576,6 +576,7 @@ struct btrfs_path { unsigned int skip_locking:1; unsigned int leave_spinning:1; unsigned int search_commit_root:1; + unsigned int really_keep_locks:1; }; /* -- cgit v1.2.1 From 70c8a91ce21b83ccd2d9e7c968775430ead4353d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 11 Oct 2012 16:54:30 -0400 Subject: Btrfs: log changed inodes based on the extent map tree We don't really need to copy extents from the source tree since we have all of the information already available to us in the extent_map tree. So instead just write the extents straight to the log tree and don't bother to copy the extent items from the source tree. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 133 ++++++++++++++++++++ fs/btrfs/ctree.h | 3 + fs/btrfs/extent_map.c | 20 ++- fs/btrfs/file.c | 2 +- fs/btrfs/inode.c | 85 +++++-------- fs/btrfs/tree-log.c | 338 +++++++++++++++++++++++++++++--------------------- fs/btrfs/volumes.c | 1 + 7 files changed, 372 insertions(+), 210 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0c5c28ff794f..e8b32641ea90 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5490,6 +5490,139 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) return btrfs_next_old_leaf(root, path, 0); } +/* Release the path up to but not including the given level */ +static void btrfs_release_level(struct btrfs_path *path, int level) +{ + int i; + + for (i = 0; i < level; i++) { + path->slots[i] = 0; + if (!path->nodes[i]) + continue; + if (path->locks[i]) { + btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); + path->locks[i] = 0; + } + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } +} + +/* + * This function assumes 2 things + * + * 1) You are using path->keep_locks + * 2) You are not inserting items. + * + * If either of these are not true do not use this function. If you need a next + * leaf with either of these not being true then this function can be easily + * adapted to do that, but at the moment these are the limitations. 
+ */ +int btrfs_next_leaf_write(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + int del) +{ + struct extent_buffer *b; + struct btrfs_key key; + u32 nritems; + int level = 1; + int slot; + int ret = 1; + int write_lock_level = BTRFS_MAX_LEVEL; + int ins_len = del ? -1 : 0; + + WARN_ON(!(path->keep_locks || path->really_keep_locks)); + + nritems = btrfs_header_nritems(path->nodes[0]); + btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); + + while (path->nodes[level]) { + nritems = btrfs_header_nritems(path->nodes[level]); + if (!(path->locks[level] & BTRFS_WRITE_LOCK)) { +search: + btrfs_release_path(path); + ret = btrfs_search_slot(trans, root, &key, path, + ins_len, 1); + if (ret < 0) + goto out; + level = 1; + continue; + } + + if (path->slots[level] >= nritems - 1) { + level++; + continue; + } + + btrfs_release_level(path, level); + break; + } + + if (!path->nodes[level]) { + ret = 1; + goto out; + } + + path->slots[level]++; + b = path->nodes[level]; + + while (b) { + level = btrfs_header_level(b); + + if (!should_cow_block(trans, root, b)) + goto cow_done; + + btrfs_set_path_blocking(path); + ret = btrfs_cow_block(trans, root, b, + path->nodes[level + 1], + path->slots[level + 1], &b); + if (ret) + goto out; +cow_done: + path->nodes[level] = b; + btrfs_clear_path_blocking(path, NULL, 0); + if (level != 0) { + ret = setup_nodes_for_search(trans, root, path, b, + level, ins_len, + &write_lock_level); + if (ret == -EAGAIN) + goto search; + if (ret) + goto out; + + b = path->nodes[level]; + slot = path->slots[level]; + + ret = read_block_for_search(trans, root, path, + &b, level, slot, &key, 0); + if (ret == -EAGAIN) + goto search; + if (ret) + goto out; + level = btrfs_header_level(b); + if (!btrfs_try_tree_write_lock(b)) { + btrfs_set_path_blocking(path); + btrfs_tree_lock(b); + btrfs_clear_path_blocking(path, b, + BTRFS_WRITE_LOCK); + } + path->locks[level] = BTRFS_WRITE_LOCK; + path->nodes[level] = b; + path->slots[level] = 0; + } else { + path->slots[level] = 0; + ret = 0; + break; + } + } + +out: + if (ret) + btrfs_release_path(path); + + return ret; +} + int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq) { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9ed452f5d062..55aff6764bb9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3187,6 +3187,9 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, } int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); +int btrfs_next_leaf_write(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + int del); int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq); static inline int btrfs_next_old_item(struct btrfs_root *root, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 85ae2b6fe03b..fff2c28497b6 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -49,7 +49,7 @@ void extent_map_tree_init(struct extent_map_tree *tree) struct extent_map *alloc_extent_map(void) { struct extent_map *em; - em = kmem_cache_alloc(extent_map_cache, GFP_NOFS); + em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS); if (!em) return NULL; em->in_tree = 0; @@ -198,16 +198,15 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) merge = rb_entry(rb, struct extent_map, rb_node); if (rb && mergable_maps(merge, em)) { em->start = merge->start; + em->orig_start = merge->orig_start; em->len += merge->len; em->block_len += 
merge->block_len; em->block_start = merge->block_start; merge->in_tree = 0; - if (merge->generation > em->generation) { - em->mod_start = em->start; - em->mod_len = em->len; - em->generation = merge->generation; - list_move(&em->list, &tree->modified_extents); - } + em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; + em->mod_start = merge->mod_start; + em->generation = max(em->generation, merge->generation); + list_move(&em->list, &tree->modified_extents); list_del_init(&merge->list); rb_erase(&merge->rb_node, &tree->map); @@ -223,11 +222,8 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->block_len += merge->len; rb_erase(&merge->rb_node, &tree->map); merge->in_tree = 0; - if (merge->generation > em->generation) { - em->mod_len = em->len; - em->generation = merge->generation; - list_move(&em->list, &tree->modified_extents); - } + em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; + em->generation = max(em->generation, merge->generation); list_del_init(&merge->list); free_extent_map(merge); } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 6810145f4e97..c56088ece500 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -621,7 +621,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, } else { split->block_len = split->len; split->block_start = em->block_start + diff; - split->orig_start = split->start; + split->orig_start = em->orig_start; } ret = add_extent_mapping(em_tree, split); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 73e6833dcc21..355a297e7988 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -95,6 +95,10 @@ static noinline int cow_file_range(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, int unlock); +static struct extent_map *create_pinned_em(struct inode *inode, u64 start, + u64 len, u64 orig_start, + u64 block_start, u64 block_len, + u64 orig_block_len, int type); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, @@ -704,10 +708,14 @@ retry: em->compress_type = async_extent->compress_type; set_bit(EXTENT_FLAG_PINNED, &em->flags); set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->generation = -1; while (1) { write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); + if (!ret) + list_move(&em->list, + &em_tree->modified_extents); write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); @@ -890,10 +898,14 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, em->orig_block_len = ins.offset; em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); + em->generation = -1; while (1) { write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); + if (!ret) + list_move(&em->list, + &em_tree->modified_extents); write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); @@ -1320,7 +1332,7 @@ out_check: em = alloc_extent_map(); BUG_ON(!em); /* -ENOMEM */ em->start = cur_offset; - em->orig_start = em->start; + em->orig_start = found_key.offset - extent_offset; em->len = num_bytes; em->block_len = num_bytes; em->block_start = disk_bytenr; @@ -1328,9 +1340,13 @@ out_check: em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); set_bit(EXTENT_FLAG_FILLING, &em->flags); + em->generation = -1; while (1) { write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); + if (!ret) + list_move(&em->list, + 
&em_tree->modified_extents); write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); @@ -5371,6 +5387,7 @@ again: if (start + len <= found_key.offset) goto not_found; em->start = start; + em->orig_start = start; em->len = found_key.offset - start; goto not_found_em; } @@ -5423,7 +5440,7 @@ again: em->len = (copy_size + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); em->orig_block_len = em->len; - em->orig_start = EXTENT_MAP_INLINE; + em->orig_start = em->start; if (compress_type) { set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); em->compress_type = compress_type; @@ -5476,6 +5493,7 @@ again: } not_found: em->start = start; + em->orig_start = start; em->len = len; not_found_em: em->block_start = EXTENT_MAP_HOLE; @@ -5677,30 +5695,14 @@ out: } static struct extent_map *btrfs_new_extent_direct(struct inode *inode, - struct extent_map *em, u64 start, u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; struct btrfs_key ins; u64 alloc_hint; int ret; - bool insert = false; - - /* - * Ok if the extent map we looked up is a hole and is for the exact - * range we want, there is no reason to allocate a new one, however if - * it is not right then we need to free this one and drop the cache for - * our range. - */ - if (em->block_start != EXTENT_MAP_HOLE || em->start != start || - em->len != len) { - free_extent_map(em); - em = NULL; - insert = true; - btrfs_drop_extent_cache(inode, start, start + len - 1, 0); - } trans = btrfs_join_transaction(root); if (IS_ERR(trans)) @@ -5716,38 +5718,10 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, goto out; } - if (!em) { - em = alloc_extent_map(); - if (!em) { - em = ERR_PTR(-ENOMEM); - goto out; - } - } - - em->start = start; - em->orig_start = em->start; - em->len = ins.offset; - - em->block_start = ins.objectid; - em->block_len = ins.offset; - em->orig_block_len = ins.offset; - em->bdev = root->fs_info->fs_devices->latest_bdev; - - /* - * We need to do this because if we're using the original em we searched - * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that. 
- */ - em->flags = 0; - set_bit(EXTENT_FLAG_PINNED, &em->flags); - - while (insert) { - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - if (ret != -EEXIST) - break; - btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0); - } + em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, + ins.offset, ins.offset, 0); + if (IS_ERR(em)) + goto out; ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, ins.offset, ins.offset, 0); @@ -5943,6 +5917,7 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, em->block_start = block_start; em->bdev = root->fs_info->fs_devices->latest_bdev; em->orig_block_len = orig_block_len; + em->generation = -1; set_bit(EXTENT_FLAG_PINNED, &em->flags); if (type == BTRFS_ORDERED_PREALLOC) set_bit(EXTENT_FLAG_FILLING, &em->flags); @@ -5952,6 +5927,9 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, em->start + em->len - 1, 0); write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); + if (!ret) + list_move(&em->list, + &em_tree->modified_extents); write_unlock(&em_tree->lock); } while (ret == -EEXIST); @@ -6078,7 +6056,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, goto must_cow; if (can_nocow_odirect(trans, inode, start, len) == 1) { - u64 orig_start = em->start; + u64 orig_start = em->orig_start; u64 orig_block_len = em->orig_block_len; if (type == BTRFS_ORDERED_PREALLOC) { @@ -6110,7 +6088,8 @@ must_cow: * it above */ len = bh_result->b_size; - em = btrfs_new_extent_direct(inode, em, start, len); + free_extent_map(em); + em = btrfs_new_extent_direct(inode, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto unlock_err; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ab7168ee618f..72444811d275 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3150,145 +3150,220 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) return 0; } -struct log_args { - struct extent_buffer *src; - u64 next_offset; - int start_slot; - int nr; -}; +static int drop_adjacent_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct extent_map *em, + struct btrfs_path *path) +{ + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + struct btrfs_key key, new_key; + struct btrfs_map_token token; + u64 extent_end; + u64 extent_offset = 0; + int extent_type; + int del_slot = 0; + int del_nr = 0; + int ret = 0; + + while (1) { + btrfs_init_map_token(&token); + leaf = path->nodes[0]; + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + if (del_nr) { + ret = btrfs_del_items(trans, root, path, + del_slot, del_nr); + if (ret) + return ret; + del_nr = 0; + } + + ret = btrfs_next_leaf_write(trans, root, path, 1); + if (ret < 0) + return ret; + if (ret > 0) + return 0; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY || + key.offset >= em->start + em->len) + break; + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_token_file_extent_type(leaf, fi, &token); + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + extent_offset = btrfs_token_file_extent_offset(leaf, + fi, &token); + extent_end = key.offset + + btrfs_token_file_extent_num_bytes(leaf, fi, + &token); + } else if (extent_type == 
BTRFS_FILE_EXTENT_INLINE) { + extent_end = key.offset + + btrfs_file_extent_inline_len(leaf, fi); + } else { + BUG(); + } + + if (extent_end <= em->len + em->start) { + if (!del_nr) { + del_slot = path->slots[0]; + } + del_nr++; + continue; + } + + /* + * Ok so we'll ignore previous items if we log a new extent, + * which can lead to overlapping extents, so if we have an + * existing extent we want to adjust we _have_ to check the next + * guy to make sure we even need this extent anymore, this keeps + * us from panicing in set_item_key_safe. + */ + if (path->slots[0] < btrfs_header_nritems(leaf) - 1) { + struct btrfs_key tmp_key; + + btrfs_item_key_to_cpu(leaf, &tmp_key, + path->slots[0] + 1); + if (tmp_key.objectid == btrfs_ino(inode) && + tmp_key.type == BTRFS_EXTENT_DATA_KEY && + tmp_key.offset <= em->start + em->len) { + if (!del_nr) + del_slot = path->slots[0]; + del_nr++; + continue; + } + } + + BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + memcpy(&new_key, &key, sizeof(new_key)); + new_key.offset = em->start + em->len; + btrfs_set_item_key_safe(trans, root, path, &new_key); + extent_offset += em->start + em->len - key.offset; + btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, + &token); + btrfs_set_token_file_extent_num_bytes(leaf, fi, extent_end - + (em->start + em->len), + &token); + btrfs_mark_buffer_dirty(leaf); + } + + if (del_nr) + ret = btrfs_del_items(trans, root, path, del_slot, del_nr); + + return ret; +} static int log_one_extent(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_root *root, - struct extent_map *em, struct btrfs_path *path, - struct btrfs_path *dst_path, struct log_args *args) + struct extent_map *em, struct btrfs_path *path) { struct btrfs_root *log = root->log_root; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + struct list_head ordered_sums; struct btrfs_key key; - u64 start = em->mod_start; - u64 search_start = start; - u64 len = em->mod_len; - u64 num_bytes; - int nritems; + u64 csum_offset = em->mod_start - em->start; + u64 csum_len = em->mod_len; + u64 extent_offset = em->start - em->orig_start; + u64 block_len; int ret; + bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - if (BTRFS_I(inode)->logged_trans == trans->transid) { - ret = __btrfs_drop_extents(trans, log, inode, dst_path, start, - start + len, NULL, 0); - if (ret) - return ret; + INIT_LIST_HEAD(&ordered_sums); + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = em->start; + path->really_keep_locks = 1; + + ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi)); + if (ret && ret != -EEXIST) { + path->really_keep_locks = 0; + return ret; + } + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, em->generation); + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + skip_csum = true; + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_PREALLOC); + } else { + btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); + if (em->block_start == 0) + skip_csum = true; + } + + block_len = max(em->block_len, em->orig_block_len); + if (em->compress_type != BTRFS_COMPRESS_NONE) { + btrfs_set_file_extent_disk_bytenr(leaf, fi, em->block_start); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, block_len); + } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { + btrfs_set_file_extent_disk_bytenr(leaf, fi, + em->block_start - + extent_offset); + btrfs_set_file_extent_disk_num_bytes(leaf, 
fi, block_len); + } else { + btrfs_set_file_extent_disk_bytenr(leaf, fi, 0); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, 0); } - while (len) { - if (args->nr) - goto next_slot; -again: - key.objectid = btrfs_ino(inode); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = search_start; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - return ret; - - if (ret) { - /* - * A rare case were we can have an em for a section of a - * larger extent so we need to make sure that this em - * falls within the extent we've found. If not we just - * bail and go back to ye-olde way of doing things but - * it happens often enough in testing that we need to do - * this dance to make sure. - */ - do { - if (path->slots[0] == 0) { - btrfs_release_path(path); - if (search_start == 0) - return -ENOENT; - search_start--; - goto again; - } + btrfs_set_file_extent_offset(leaf, fi, em->start - em->orig_start); + btrfs_set_file_extent_num_bytes(leaf, fi, em->len); + btrfs_set_file_extent_ram_bytes(leaf, fi, em->len); + btrfs_set_file_extent_compression(leaf, fi, em->compress_type); + btrfs_set_file_extent_encryption(leaf, fi, 0); + btrfs_set_file_extent_other_encoding(leaf, fi, 0); + btrfs_mark_buffer_dirty(leaf); - path->slots[0]--; - btrfs_item_key_to_cpu(path->nodes[0], &key, - path->slots[0]); - if (key.objectid != btrfs_ino(inode) || - key.type != BTRFS_EXTENT_DATA_KEY) { - btrfs_release_path(path); - return -ENOENT; - } - } while (key.offset > start); + /* + * Have to check the extent to the right of us to make sure it doesn't + * fall in our current range. We're ok if the previous extent is in our + * range since the recovery stuff will run us in key order and thus just + * drop the part we overwrote. + */ + ret = drop_adjacent_extents(trans, log, inode, em, path); + btrfs_release_path(path); + path->really_keep_locks = 0; + if (ret) { + return ret; + } - num_bytes = btrfs_file_extent_length(path); - if (key.offset + num_bytes <= start) { - btrfs_release_path(path); - return -ENOENT; - } - } - args->src = path->nodes[0]; -next_slot: - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - num_bytes = btrfs_file_extent_length(path); - if (args->nr && - args->start_slot + args->nr == path->slots[0]) { - args->nr++; - } else if (args->nr) { - ret = copy_items(trans, inode, dst_path, args->src, - args->start_slot, args->nr, - LOG_INODE_ALL); - if (ret) - return ret; - args->nr = 1; - args->start_slot = path->slots[0]; - } else if (!args->nr) { - args->nr = 1; - args->start_slot = path->slots[0]; - } - nritems = btrfs_header_nritems(path->nodes[0]); - path->slots[0]++; - if (len < num_bytes) { - /* I _think_ this is ok, envision we write to a - * preallocated space that is adjacent to a previously - * written preallocated space that gets merged when we - * mark this preallocated space written. If we do not - * have the adjacent extent in cache then when we copy - * this extent it could end up being larger than our EM - * thinks it is, which is a-ok, so just set len to 0. - */ - len = 0; - } else { - len -= num_bytes; - } - start = key.offset + num_bytes; - args->next_offset = start; - search_start = start; + if (skip_csum) + return 0; - if (path->slots[0] < nritems) { - if (len) - goto next_slot; - break; - } + /* block start is already adjusted for the file extent offset. 
*/ + ret = btrfs_lookup_csums_range(log->fs_info->csum_root, + em->block_start + csum_offset, + em->block_start + csum_offset + + csum_len - 1, &ordered_sums, 0); + if (ret) + return ret; - if (args->nr) { - ret = copy_items(trans, inode, dst_path, args->src, - args->start_slot, args->nr, - LOG_INODE_ALL); - if (ret) - return ret; - args->nr = 0; - btrfs_release_path(path); - } + while (!list_empty(&ordered_sums)) { + struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, + struct btrfs_ordered_sum, + list); + if (!ret) + ret = btrfs_csum_file_blocks(trans, log, sums); + list_del(&sums->list); + kfree(sums); } - return 0; + return ret; } static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - struct btrfs_path *path, - struct btrfs_path *dst_path) + struct btrfs_path *path) { - struct log_args args; struct extent_map *em, *n; struct list_head extents; struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; @@ -3297,8 +3372,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, INIT_LIST_HEAD(&extents); - memset(&args, 0, sizeof(args)); - write_lock(&tree->lock); test_gen = root->fs_info->last_trans_committed; @@ -3331,34 +3404,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, write_unlock(&tree->lock); - /* - * If the previous EM and the last extent we left off on aren't - * sequential then we need to copy the items we have and redo - * our search - */ - if (args.nr && em->mod_start != args.next_offset) { - ret = copy_items(trans, inode, dst_path, args.src, - args.start_slot, args.nr, - LOG_INODE_ALL); - if (ret) { - free_extent_map(em); - write_lock(&tree->lock); - continue; - } - btrfs_release_path(path); - args.nr = 0; - } - - ret = log_one_extent(trans, inode, root, em, path, dst_path, &args); + ret = log_one_extent(trans, inode, root, em, path); free_extent_map(em); write_lock(&tree->lock); } WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); - if (!ret && args.nr) - ret = copy_items(trans, inode, dst_path, args.src, - args.start_slot, args.nr, LOG_INODE_ALL); btrfs_release_path(path); return ret; } @@ -3551,10 +3603,8 @@ next_slot: log_extents: if (fast_search) { - btrfs_release_path(path); btrfs_release_path(dst_path); - ret = btrfs_log_changed_extents(trans, root, inode, path, - dst_path); + ret = btrfs_log_changed_extents(trans, root, inode, dst_path); if (ret) { err = ret; goto out_unlock; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 886f4ba0f71d..d79b5b620e94 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4983,6 +4983,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, em->bdev = (struct block_device *)map; em->start = logical; em->len = length; + em->orig_start = 0; em->block_start = 0; em->block_len = em->len; -- cgit v1.2.1 From bb146eb265091f472ada52a3419d41e9b0ff1f7d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 15 Oct 2012 13:30:43 -0400 Subject: Btrfs: move checks in set_page_dirty under DEBUG This is a high traffic function, let's try and do as little as possible during normal operations shall we? 
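The resulting function, reconstructed from the hunk below with the elided context filled in from the surrounding code; the sanity checks now compile away entirely unless DEBUG is defined:

static int btree_set_page_dirty(struct page *page)
{
#ifdef DEBUG
	struct extent_buffer *eb;

	BUG_ON(!PagePrivate(page));
	eb = (struct extent_buffer *)page->private;
	BUG_ON(!eb);
	BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
	BUG_ON(!atomic_read(&eb->refs));
	btrfs_assert_tree_locked(eb);
#endif
	return __set_page_dirty_nobuffers(page);
}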
Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index faf182691b40..b8f7f04a6407 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1001,6 +1001,7 @@ static void btree_invalidatepage(struct page *page, unsigned long offset) static int btree_set_page_dirty(struct page *page) { +#ifdef DEBUG struct extent_buffer *eb; BUG_ON(!PagePrivate(page)); @@ -1009,6 +1010,7 @@ static int btree_set_page_dirty(struct page *page) BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); BUG_ON(!atomic_read(&eb->refs)); btrfs_assert_tree_locked(eb); +#endif return __set_page_dirty_nobuffers(page); } -- cgit v1.2.1 From ed7b63eb8afd0bb8d978a23184d70c105b54aa26 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 15 Oct 2012 13:33:54 -0400 Subject: Btrfs: only clear dirty on the buffer if it is marked as dirty No reason to set the path blocking or loop through all of the pages if the extent buffer isn't actually marked dirty. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b8f7f04a6407..65f03670a952 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1142,11 +1142,11 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, root->fs_info->dirty_metadata_bytes); } spin_unlock(&root->fs_info->delalloc_lock); - } - /* ugh, clear_extent_buffer_dirty needs to lock the page */ - btrfs_set_lock_blocking(buf); - clear_extent_buffer_dirty(buf); + /* ugh, clear_extent_buffer_dirty needs to lock the page */ + btrfs_set_lock_blocking(buf); + clear_extent_buffer_dirty(buf); + } } } -- cgit v1.2.1 From ad9145596986b672d8c8235c92ed5307f82d045d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 15 Oct 2012 13:39:33 -0400 Subject: Btrfs: don't memset new tokens Our token logic depends on token->kaddr being set, and if it is not it sets everything properly as needed. So instead of memsetting just set token->kaddr to NULL. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 55aff6764bb9..cd02205f13c8 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1860,7 +1860,7 @@ struct btrfs_map_token { static inline void btrfs_init_map_token (struct btrfs_map_token *token) { - memset(token, 0, sizeof(*token)); + token->kaddr = NULL; } /* some macros to generate set/get funcs for the struct fields. This -- cgit v1.2.1 From 41be1f3b40b87de33cd2e7463dce88596dbdccc4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 15 Oct 2012 13:43:18 -0400 Subject: Btrfs: optimize leaf_space_used This gets called at least 4 times for every level while adding an object, and it involves 3 kmapping calls, which on my box take about 5us a piece. So instead use a token, which brings us down to 1 kmap call and makes this function take 1/3 of the time per call. 
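The idiom, condensed from the hunk below: a single btrfs_map_token keeps the page mapped across consecutive accessors, so three kmap calls collapse into one:

	struct btrfs_map_token token;
	struct btrfs_item *start_item, *end_item;
	int data_len;

	btrfs_init_map_token(&token);
	start_item = btrfs_item_nr(l, start);
	end_item = btrfs_item_nr(l, end);
	/* one mapping serves all three accesses below */
	data_len = btrfs_token_item_offset(l, start_item, &token) +
		   btrfs_token_item_size(l, start_item, &token);
	data_len -= btrfs_token_item_offset(l, end_item, &token);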
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e8b32641ea90..e7bea1d5f75f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3298,14 +3298,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans, */ static int leaf_space_used(struct extent_buffer *l, int start, int nr) { + struct btrfs_item *start_item; + struct btrfs_item *end_item; + struct btrfs_map_token token; int data_len; int nritems = btrfs_header_nritems(l); int end = min(nritems, start + nr) - 1; if (!nr) return 0; - data_len = btrfs_item_end_nr(l, start); - data_len = data_len - btrfs_item_offset_nr(l, end); + btrfs_init_map_token(&token); + start_item = btrfs_item_nr(l, start); + end_item = btrfs_item_nr(l, end); + data_len = btrfs_token_item_offset(l, start_item, &token) + + btrfs_token_item_size(l, start_item, &token); + data_len = data_len - btrfs_token_item_offset(l, end_item, &token); data_len += sizeof(struct btrfs_item) * nr; WARN_ON(data_len < 0); return data_len; -- cgit v1.2.1 From 0b1c6ccadee4ea4adb98799f3430fc72e57a187f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 23 Oct 2012 16:03:44 -0400 Subject: Btrfs: use tokens where we can in the tree log If we are syncing over and over the overhead of doing all those maps in fill_inode_item and log_changed_extents really starts to hurt, so use map tokens so we can avoid all the extra mapping. Since the token maps from our offset to the end of the page make sure to set the first thing in the item first so we really only do one map. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 127 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 73 insertions(+), 54 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 72444811d275..83186c7e45d4 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2952,33 +2952,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *item, struct inode *inode, int log_inode_only) { - btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); - btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); - btrfs_set_inode_mode(leaf, item, inode->i_mode); - btrfs_set_inode_nlink(leaf, item, inode->i_nlink); - - btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), - inode->i_atime.tv_sec); - btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), - inode->i_atime.tv_nsec); - - btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), - inode->i_mtime.tv_sec); - btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), - inode->i_mtime.tv_nsec); - - btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), - inode->i_ctime.tv_sec); - btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), - inode->i_ctime.tv_nsec); - - btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); - - btrfs_set_inode_sequence(leaf, item, inode->i_version); - btrfs_set_inode_transid(leaf, item, trans->transid); - btrfs_set_inode_rdev(leaf, item, inode->i_rdev); - btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); - btrfs_set_inode_block_group(leaf, item, 0); + struct btrfs_map_token token; + + btrfs_init_map_token(&token); if (log_inode_only) { /* set the generation to zero so the recover code @@ -2986,14 +2962,43 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, * just to say 'this inode exists' and a logging * to say 'update this inode with these 
values' */ - btrfs_set_inode_generation(leaf, item, 0); - btrfs_set_inode_size(leaf, item, 0); + btrfs_set_token_inode_generation(leaf, item, 0, &token); + btrfs_set_token_inode_size(leaf, item, 0, &token); } else { - btrfs_set_inode_generation(leaf, item, - BTRFS_I(inode)->generation); - btrfs_set_inode_size(leaf, item, inode->i_size); - } - + btrfs_set_token_inode_generation(leaf, item, + BTRFS_I(inode)->generation, + &token); + btrfs_set_token_inode_size(leaf, item, inode->i_size, &token); + } + + btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); + btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); + btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); + btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); + + btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_nsec, &token); + + btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_nsec, &token); + + btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_nsec, &token); + + btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), + &token); + + btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); + btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); + btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); + btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); + btrfs_set_token_inode_block_group(leaf, item, 0, &token); } static int log_inode_item(struct btrfs_trans_handle *trans, @@ -3267,6 +3272,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; struct list_head ordered_sums; + struct btrfs_map_token token; struct btrfs_key key; u64 csum_offset = em->mod_start - em->start; u64 csum_len = em->mod_len; @@ -3276,6 +3282,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; INIT_LIST_HEAD(&ordered_sums); + btrfs_init_map_token(&token); key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = em->start; @@ -3289,37 +3296,49 @@ static int log_one_extent(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - btrfs_set_file_extent_generation(leaf, fi, em->generation); + btrfs_set_token_file_extent_generation(leaf, fi, em->generation, + &token); if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { skip_csum = true; - btrfs_set_file_extent_type(leaf, fi, - BTRFS_FILE_EXTENT_PREALLOC); + btrfs_set_token_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_PREALLOC, + &token); } else { - btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); + btrfs_set_token_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG, + &token); if (em->block_start == 0) skip_csum = true; } block_len = max(em->block_len, em->orig_block_len); if (em->compress_type != BTRFS_COMPRESS_NONE) { - btrfs_set_file_extent_disk_bytenr(leaf, fi, em->block_start); - btrfs_set_file_extent_disk_num_bytes(leaf, fi, block_len); + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, + em->block_start, + &token); + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 
block_len, + &token); } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { - btrfs_set_file_extent_disk_bytenr(leaf, fi, - em->block_start - - extent_offset); - btrfs_set_file_extent_disk_num_bytes(leaf, fi, block_len); + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, + em->block_start - + extent_offset, &token); + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, + &token); } else { - btrfs_set_file_extent_disk_bytenr(leaf, fi, 0); - btrfs_set_file_extent_disk_num_bytes(leaf, fi, 0); - } - - btrfs_set_file_extent_offset(leaf, fi, em->start - em->orig_start); - btrfs_set_file_extent_num_bytes(leaf, fi, em->len); - btrfs_set_file_extent_ram_bytes(leaf, fi, em->len); - btrfs_set_file_extent_compression(leaf, fi, em->compress_type); - btrfs_set_file_extent_encryption(leaf, fi, 0); - btrfs_set_file_extent_other_encoding(leaf, fi, 0); + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, + &token); + } + + btrfs_set_token_file_extent_offset(leaf, fi, + em->start - em->orig_start, + &token); + btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); + btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->len, &token); + btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, + &token); + btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); + btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); btrfs_mark_buffer_dirty(leaf); /* -- cgit v1.2.1 From 5124e00ec5b0be56155a11aec416fcc5125339f1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 7 Nov 2012 13:44:13 -0500 Subject: Btrfs: only unlock and relock if we have to I noticed while doing fsync tests that we were always dropping the path and re-searching when we first cow the log root even though we've already gotten the write lock on the root. That's because we don't take into account that there might not be a parent node, so fix the check to make sure there is actually a parent node before we undo all of this work for nothing. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e7bea1d5f75f..c7b67cf24bba 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2564,7 +2564,10 @@ again: * must have write locks on this node and the * parent */ - if (level + 1 > write_lock_level) { + if (level > write_lock_level || + (level + 1 > write_lock_level && + level + 1 < BTRFS_MAX_LEVEL && + p->nodes[level + 1])) { write_lock_level = level + 1; btrfs_release_path(p); goto again; -- cgit v1.2.1 From 6c760c072403f446ff829ec9e89568943a3c2ef2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Nov 2012 10:53:21 -0500 Subject: Btrfs: do not call file_update_time in aio_write This starts a transaction and dirties the inode every time we call it, which is super expensive if you have a write-heavy workload. We will be updating the inode when the IO completes and we reserve the space for the inode update when we reserve space for the write, so there is no chance of loss of information or ENOSPC issues.
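In outline, the write path now touches only the in-memory inode. A minimal sketch of the idea, mirroring the update_time_for_write() helper added in the hunk below (simplified; the NOCMTIME check is omitted here):

	static void sketch_update_time_for_write(struct inode *inode)
	{
		struct timespec now = current_fs_time(inode->i_sb);

		/* bump the times in memory only; no transaction is started */
		if (!timespec_equal(&inode->i_mtime, &now))
			inode->i_mtime = now;
		if (!timespec_equal(&inode->i_ctime, &now))
			inode->i_ctime = now;
		if (IS_I_VERSION(inode))
			inode_inc_iversion(inode);
	}

The on-disk inode is brought up to date later by ordered extent completion, which uses the space already reserved at write time.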
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/file.c | 35 ++++++++++++++++++++++++++++++----- fs/btrfs/inode.c | 42 ++++++++++++++++++------------------------ 2 files changed, 48 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index c56088ece500..20452c110d7d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1464,6 +1464,24 @@ out: return written ? written : err; } +static void update_time_for_write(struct inode *inode) +{ + struct timespec now; + + if (IS_NOCMTIME(inode)) + return; + + now = current_fs_time(inode->i_sb); + if (!timespec_equal(&inode->i_mtime, &now)) + inode->i_mtime = now; + + if (!timespec_equal(&inode->i_ctime, &now)) + inode->i_ctime = now; + + if (IS_I_VERSION(inode)) + inode_inc_iversion(inode); +} + static ssize_t btrfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) @@ -1519,11 +1537,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, goto out; } - err = file_update_time(file); - if (err) { - mutex_unlock(&inode->i_mutex); - goto out; - } + /* + * We reserve space for updating the inode when we reserve space for the + * extent we are going to write, so we will enospc out there. We don't + * need to start yet another transaction to update the inode as we will + * update the inode when we finish writing whatever data we write. + */ + update_time_for_write(inode); start_pos = round_down(pos, root->sectorsize); if (start_pos > i_size_read(inode)) { @@ -1563,8 +1583,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, * this will either be one more than the running transaction * or the generation used for the next transaction if there isn't * one running right now. + * + * We also have to set last_sub_trans to the current log transid, + * otherwise subsequent syncs to a file that's been synced in this + * transaction will appear to have already occured. 
*/ BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; + BTRFS_I(inode)->last_sub_trans = root->log_transid; if (num_written > 0 || num_written == -EIOCBQUEUED) { err = generic_write_sync(file, pos, num_written); if (err < 0 && num_written > 0) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 355a297e7988..1673dbdf1f76 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1922,22 +1922,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ - ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); - if (!ret) { - if (nolock) - trans = btrfs_join_transaction_nolock(root); - else - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - trans->block_rsv = &root->fs_info->delalloc_block_rsv; - ret = btrfs_update_inode_fallback(trans, root, inode); - if (ret) /* -ENOMEM or corruption */ - btrfs_abort_transaction(trans, root, ret); + btrfs_ordered_update_i_size(inode, 0, ordered_extent); + if (nolock) + trans = btrfs_join_transaction_nolock(root); + else + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out; } + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + ret = btrfs_update_inode_fallback(trans, root, inode); + if (ret) /* -ENOMEM or corruption */ + btrfs_abort_transaction(trans, root, ret); goto out; } @@ -1986,15 +1984,11 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) add_pending_csums(trans, inode, ordered_extent->file_offset, &ordered_extent->list); - ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); - if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { - ret = btrfs_update_inode_fallback(trans, root, inode); - if (ret) { /* -ENOMEM or corruption */ - btrfs_abort_transaction(trans, root, ret); - goto out_unlock; - } - } else { - btrfs_set_inode_last_trans(trans, inode); + btrfs_ordered_update_i_size(inode, 0, ordered_extent); + ret = btrfs_update_inode_fallback(trans, root, inode); + if (ret) { /* -ENOMEM or corruption */ + btrfs_abort_transaction(trans, root, ret); + goto out_unlock; } ret = 0; out_unlock: -- cgit v1.2.1 From 4ded4f639533ed5f02a0f0ab20d43bb9659c91f8 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Wed, 14 Nov 2012 18:57:29 +0000 Subject: Btrfs: fix BUG() in scrub when first superblock reading gives EIO This fixes a very special case that can be reproduced by just disconnecting a disk at runtime, and without unmounting the filesystem first, start scrub on the filesystem with the disconnected disk. All read and write EIOs are handled correctly, only the first superblock is an exception and gives a BUG() in a subfunction. The BUG() is correct, it would crash later otherwise. The subfunction must not be called for superblocks and this is what the fix changes. 
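The shape of the fix, in brief (a simplified restatement of the scrub.c hunk below): superblock read errors are only counted and reported, never handed to the per-extent repair logic:

	if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
		/* a bad superblock is rewritten by the next commit anyway */
		spin_lock(&sctx->stat_lock);
		++sctx->stat.super_errors;
		spin_unlock(&sctx->stat_lock);
		return 0;	/* bail out before the repair paths */
	}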
Reported-by: Joeri Vanthienen Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 8db6a6413a5f..bdbb94f245c9 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -785,6 +785,17 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) BUG_ON(sblock_to_check->page_count < 1); fs_info = sctx->dev_root->fs_info; + if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { + /* + * if we find an error in a super block, we just report it. + * They will get written with the next transaction commit + * anyway + */ + spin_lock(&sctx->stat_lock); + ++sctx->stat.super_errors; + spin_unlock(&sctx->stat_lock); + return 0; + } length = sblock_to_check->page_count * PAGE_SIZE; logical = sblock_to_check->pagev[0]->logical; generation = sblock_to_check->pagev[0]->generation; -- cgit v1.2.1 From 31e502298d80e2af9001d17dc419a3fd4b0bebef Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 21 Nov 2012 14:18:10 +0000 Subject: Btrfs: put raid properties into global table Raid properties can be shared among raid calculation code, we can put them into a global table to keep it simple. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 1 + fs/btrfs/extent-tree.c | 6 +++--- fs/btrfs/volumes.c | 46 ++++++++++++++++------------------------------ fs/btrfs/volumes.h | 9 +++++++++ 4 files changed, 29 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index cd02205f13c8..44d9bc87e863 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3077,6 +3077,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range); int btrfs_init_space_info(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); +int __get_raid_index(u64 flags); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e15280989188..b9526f749049 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5479,7 +5479,7 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) return 0; } -static int __get_block_group_index(u64 flags) +int __get_raid_index(u64 flags) { int index; @@ -5499,7 +5499,7 @@ static int __get_block_group_index(u64 flags) static int get_block_group_index(struct btrfs_block_group_cache *cache) { - return __get_block_group_index(cache->flags); + return __get_raid_index(cache->flags); } enum btrfs_loop_type { @@ -7441,7 +7441,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) */ target = get_restripe_target(root->fs_info, block_group->flags); if (target) { - index = __get_block_group_index(extended_to_chunk(target)); + index = __get_raid_index(extended_to_chunk(target)); } else { /* * this is just a balance, so if we were marked as full diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d79b5b620e94..5cce6aa74012 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3491,6 +3491,14 @@ static int btrfs_cmp_device_info(const void *a, const void *b) return 0; } +struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { + { 2, 1, 0, 4, 2, 2 /* raid10 */ }, + { 1, 1, 2, 2, 2, 2 /* raid1 */ }, + { 1, 2, 1, 1, 1, 2 /* dup */ }, + { 1, 1, 0, 2, 1, 1 /* raid0 */ }, + { 1, 1, 0, 1, 1, 1 /* single */ }, +}; + static int 
__btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, struct map_lookup **map_ret, @@ -3520,43 +3528,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, int ndevs; int i; int j; + int index; BUG_ON(!alloc_profile_is_valid(type, 0)); if (list_empty(&fs_devices->alloc_list)) return -ENOSPC; - sub_stripes = 1; - dev_stripes = 1; - devs_increment = 1; - ncopies = 1; - devs_max = 0; /* 0 == as many as possible */ - devs_min = 1; + index = __get_raid_index(type); - /* - * define the properties of each RAID type. - * FIXME: move this to a global table and use it in all RAID - * calculation code - */ - if (type & (BTRFS_BLOCK_GROUP_DUP)) { - dev_stripes = 2; - ncopies = 2; - devs_max = 1; - } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) { - devs_min = 2; - } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) { - devs_increment = 2; - ncopies = 2; - devs_max = 2; - devs_min = 2; - } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) { - sub_stripes = 2; - devs_increment = 2; - ncopies = 2; - devs_min = 4; - } else { - devs_max = 1; - } + sub_stripes = btrfs_raid_array[index].sub_stripes; + dev_stripes = btrfs_raid_array[index].dev_stripes; + devs_max = btrfs_raid_array[index].devs_max; + devs_min = btrfs_raid_array[index].devs_min; + devs_increment = btrfs_raid_array[index].devs_increment; + ncopies = btrfs_raid_array[index].ncopies; if (type & BTRFS_BLOCK_GROUP_DATA) { max_stripe_size = 1024 * 1024 * 1024; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 37d0157167b0..d3c3939ac751 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -180,6 +180,15 @@ struct btrfs_device_info { u64 total_avail; }; +struct btrfs_raid_attr { + int sub_stripes; /* sub_stripes info for map */ + int dev_stripes; /* stripes per dev */ + int devs_max; /* max devs to use */ + int devs_min; /* min devs needed */ + int devs_increment; /* ndevs has to be a multiple of this */ + int ncopies; /* how many copies to data has */ +}; + struct map_lookup { u64 type; int io_align; -- cgit v1.2.1 From 9185aa587b7425f8f4520da2e66792f5f3c2b815 Mon Sep 17 00:00:00 2001 From: Filipe Brandenburger Date: Fri, 30 Nov 2012 03:40:08 +0000 Subject: Btrfs: fix permissions of empty files not affected by umask When a new file is created with btrfs_create(), the inode will initially be created with permissions 0666 and later on in btrfs_init_acl() it will be adapted to mask out the umask bits. The problem is that this change won't make it into the btrfs_inode unless there's another change to the inode (e.g. writing content changing the size or touching the file changing the mtime.) This fix adds a call to btrfs_update_inode() to btrfs_create() to make sure that the change will not get lost if the in-memory inode is flushed before other changes are made to the file. Signed-off-by: Filipe Brandenburger Reviewed-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1673dbdf1f76..2e6918c85b72 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5041,6 +5041,10 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (err) goto out_unlock; + err = btrfs_update_inode(trans, root, inode); + if (err) + goto out_unlock; + /* * If the active LSM wants to access the inode during * d_instantiate it needs these. 
Smack checks to see -- cgit v1.2.1 From 1135d6df222046a0ec14a1c9335de99907879922 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Dec 2012 13:46:43 -0500 Subject: Btrfs: fix autodefrag and umount lockup This happens because writeback_inodes_sb_nr_if_idle does down_read. This doesn't work for us and it has not been fixed upstream yet, so do it ourselves and use that instead so we can stop having this stupid long standing lockup. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b9526f749049..afc3ac5e57d7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3689,6 +3689,20 @@ static int can_overcommit(struct btrfs_root *root, return 0; } +static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb, + unsigned long nr_pages, + enum wb_reason reason) +{ + if (!writeback_in_progress(sb->s_bdi) && + down_read_trylock(&sb->s_umount)) { + writeback_inodes_sb_nr(sb, nr_pages, reason); + up_read(&sb->s_umount); + return 1; + } + + return 0; +} + /* * shrink metadata reservation for delalloc */ @@ -3721,8 +3735,9 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, while (delalloc_bytes && loops < 3) { max_reclaim = min(delalloc_bytes, to_reclaim); nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; - writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, - WB_REASON_FS_FREE_SPACE); + writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb, + nr_pages, + WB_REASON_FS_FREE_SPACE); /* * We need to wait for the async pages to actually start before -- cgit v1.2.1 From c64c2bd890df3b9a66c52c33df110777058c011e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Dec 2012 13:48:14 -0500 Subject: Btrfs: don't take inode delalloc mutex if we're a free space inode This confuses and angers lockdep even though it's ok. We don't really need the lock for free space inodes since only the transaction committer will be reserving space. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index afc3ac5e57d7..d133edfcd449 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4535,16 +4535,25 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) int extra_reserve = 0; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; int ret; + bool delalloc_lock = true; - /* Need to be holding the i_mutex here if we aren't free space cache */ - if (btrfs_is_free_space_inode(inode)) + /* If we are a free space inode we need to not flush since we will be in + * the middle of a transaction commit. We also don't need the delalloc + * mutex since we won't race with anybody. We need this mostly to make + * lockdep shut its filthy mouth. 
+ */ + if (btrfs_is_free_space_inode(inode)) { flush = BTRFS_RESERVE_NO_FLUSH; + delalloc_lock = false; + } if (flush != BTRFS_RESERVE_NO_FLUSH && btrfs_transaction_in_commit(root->fs_info)) schedule_timeout(1); - mutex_lock(&BTRFS_I(inode)->delalloc_mutex); + if (delalloc_lock) + mutex_lock(&BTRFS_I(inode)->delalloc_mutex); + num_bytes = ALIGN(num_bytes, root->sectorsize); spin_lock(&BTRFS_I(inode)->lock); @@ -4577,7 +4586,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) spin_lock(&BTRFS_I(inode)->lock); calc_csum_metadata_size(inode, num_bytes, 0); spin_unlock(&BTRFS_I(inode)->lock); - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); + if (delalloc_lock) + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); return ret; } } @@ -4616,7 +4626,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) btrfs_qgroup_free(root, num_bytes + nr_extents * root->leafsize); } - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); + if (delalloc_lock) + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); return ret; } @@ -4628,7 +4639,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) } BTRFS_I(inode)->reserved_extents += nr_extents; spin_unlock(&BTRFS_I(inode)->lock); - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); + + if (delalloc_lock) + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); if (to_reserve) trace_btrfs_space_reservation(root->fs_info,"delalloc", -- cgit v1.2.1 From c36575e663e302dbaa4d16b9c72d2c9a913a9aef Mon Sep 17 00:00:00 2001 From: Forrest Liu Date: Mon, 17 Dec 2012 09:55:39 -0500 Subject: ext4: fix extent tree corruption caused by hole punch When depth of extent tree is greater than 1, logical start value of interior node is not correctly updated in ext4_ext_rm_idx. Signed-off-by: Forrest Liu Signed-off-by: "Theodore Ts'o" Reviewed-by: Ashish Sangwan Cc: stable@vger.kernel.org --- fs/ext4/extents.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 26af22832a84..5ae1674ec12f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2226,13 +2226,14 @@ errout: * removes index from the index block. 
*/ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path) + struct ext4_ext_path *path, int depth) { int err; ext4_fsblk_t leaf; /* free index block */ - path--; + depth--; + path = path + depth; leaf = ext4_idx_pblock(path->p_idx); if (unlikely(path->p_hdr->eh_entries == 0)) { EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); @@ -2257,6 +2258,19 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, ext4_free_blocks(handle, inode, NULL, leaf, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); + + while (--depth >= 0) { + if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr)) + break; + path--; + err = ext4_ext_get_access(handle, inode, path); + if (err) + break; + path->p_idx->ei_block = (path+1)->p_idx->ei_block; + err = ext4_ext_dirty(handle, inode, path); + if (err) + break; + } return err; } @@ -2599,7 +2613,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, /* if this leaf is free, then we should * remove it from index block above */ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) - err = ext4_ext_rm_idx(handle, inode, path + depth); + err = ext4_ext_rm_idx(handle, inode, path, depth); out: return err; @@ -2802,7 +2816,7 @@ again: /* index is empty, remove it; * handle must be already prepared by the * truncatei_leaf() */ - err = ext4_ext_rm_idx(handle, inode, path + i); + err = ext4_ext_rm_idx(handle, inode, path, i); } /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); -- cgit v1.2.1 From 9c52057c698fb96f8f07e7a4bcf4801a092bda89 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 17 Dec 2012 14:26:57 -0500 Subject: Btrfs: fix hash overflow handling The handling for directory crc hash overflows was fairly obscure, split_leaf returns EOVERFLOW when we try to extend the item and that is supposed to bubble up to userland. For a while it did so, but along the way we added better handling of errors and forced the FS readonly if we hit IO errors during the directory insertion. Along the way, we started testing only for EEXIST and the EOVERFLOW case was dropped. The end result is that we may force the FS readonly if we catch a directory hash bucket overflow. This fixes a few problem spots. First I add tests for EOVERFLOW in the places where we can safely just return the error up the chain. btrfs_rename is harder though, because it tries to insert the new directory item only after it has already unlinked anything the rename was going to overwrite. Rather than adding very complex logic, I added a helper to test for the hash overflow case early while it is still safe to bail out. Snapshot and subvolume creation had a similar problem, so they are using the new helper now too. 
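A sketch of how callers are expected to use the new helper, condensed from the btrfs_rename() hunk below (error handling trimmed):

	/* probe for collisions while it is still safe to back out */
	ret = btrfs_check_dir_item_collision(root, new_dir->i_ino,
					     new_dentry->d_name.name,
					     new_dentry->d_name.len);
	if (ret == -EOVERFLOW)
		return ret;	/* hash bucket is full; nothing unlinked yet */
	if (ret == -EEXIST && !new_inode)
		WARN_ON(1);	/* name exists but there is nothing to replace */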
Signed-off-by: Chris Mason Reported-by: Pascal Junod --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/dir-item.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/inode.c | 24 +++++++++++++++++++- fs/btrfs/ioctl.c | 10 +++++++++ fs/btrfs/transaction.c | 2 +- 5 files changed, 95 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 44d9bc87e863..547b7b05727f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3283,6 +3283,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct btrfs_root *root); /* dir-item.c */ +int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, + const char *name, int name_len); int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, struct inode *dir, diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index c1a074d0696f..502c2158167c 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -213,6 +213,65 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, return btrfs_match_dir_item_name(root, path, name, name_len); } +int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, + const char *name, int name_len) +{ + int ret; + struct btrfs_key key; + struct btrfs_dir_item *di; + int data_size; + struct extent_buffer *leaf; + int slot; + struct btrfs_path *path; + + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + key.offset = btrfs_name_hash(name, name_len); + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + + /* return back any errors */ + if (ret < 0) + goto out; + + /* nothing found, we're safe */ + if (ret > 0) { + ret = 0; + goto out; + } + + /* we found an item, look for our name in the item */ + di = btrfs_match_dir_item_name(root, path, name, name_len); + if (di) { + /* our exact name was found */ + ret = -EEXIST; + goto out; + } + + /* + * see if there is room in the item to insert this + * name + */ + data_size = sizeof(*di) + name_len + sizeof(struct btrfs_item); + leaf = path->nodes[0]; + slot = path->slots[0]; + if (data_size + btrfs_item_size_nr(leaf, slot) + + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) { + ret = -EOVERFLOW; + } else { + /* plenty of insertion room */ + ret = 0; + } +out: + btrfs_free_path(path); + return ret; +} + /* * lookup a directory item based on index. 
'dir' is the objectid * we're searching in, and 'mod' tells us if you plan on deleting the diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2e6918c85b72..e95b1f90a1f6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4885,7 +4885,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, ret = btrfs_insert_dir_item(trans, root, name, name_len, parent_inode, &key, btrfs_inode_type(inode), index); - if (ret == -EEXIST) + if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; else if (ret) { btrfs_abort_transaction(trans, root, ret); @@ -7336,6 +7336,28 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (S_ISDIR(old_inode->i_mode) && new_inode && new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; + + + /* check for collisions, even if the name isn't there */ + ret = btrfs_check_dir_item_collision(root, new_dir->i_ino, + new_dentry->d_name.name, + new_dentry->d_name.len); + + if (ret) { + if (ret == -EEXIST) { + /* we shouldn't get + * eexist without a new_inode */ + if (!new_inode) { + WARN_ON(1); + return ret; + } + } else { + /* maybe -EOVERFLOW */ + return ret; + } + } + ret = 0; + /* * we're using rename to replace one file with another. * and the replacement file is large. Start IO on it now so diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 657d83ca9dea..d4608ab72b79 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -710,6 +710,16 @@ static noinline int btrfs_mksubvol(struct path *parent, if (error) goto out_dput; + /* + * even if this name doesn't exist, we may get hash collisions. + * check for them now when we can safely fail + */ + error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, + dir->i_ino, name, + namelen); + if (error) + goto out_dput; + down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e6509b92433b..87fac9a21ea5 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1190,7 +1190,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, parent_inode, &key, BTRFS_FT_DIR, index); /* We have check then name at the beginning, so it is impossible. */ - BUG_ON(ret == -EEXIST); + BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); if (ret) { btrfs_abort_transaction(trans, root, ret); goto fail; -- cgit v1.2.1 From 213490b301773ea9c6fb89a86424a6901fcdd069 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 11 Sep 2012 08:33:50 -0600 Subject: Btrfs: fix a bug of per-file nocow Users report a bug, the reproducer is: $ mkfs.btrfs /dev/loop0 $ mount /dev/loop0 /mnt/btrfs/ $ mkdir /mnt/btrfs/dir $ chattr +C /mnt/btrfs/dir/ $ dd if=/dev/zero of=/mnt/btrfs/dir/foo bs=4K count=10; $ lsattr /mnt/btrfs/dir/foo ---------------C- /mnt/btrfs/dir/foo $ filefrag /mnt/btrfs/dir/foo /mnt/btrfs/dir/foo: 1 extent found ---> an extent $ dd if=/dev/zero of=/mnt/btrfs/dir/foo bs=4K count=1 seek=5 conv=notrunc,nocreat; sync $ filefrag /mnt/btrfs/dir/foo /mnt/btrfs/dir/foo: 3 extents found ---> with nocow, btrfs breaks the extent into three parts The new created file should not only inherit the NODATACOW flag, but also honor NODATASUM flag, because we must do COW on a file extent with checksum. 
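The inheritance rule, in brief (condensed from the btrfs_inherit_iflags() hunk below): when a regular file inherits NODATACOW it must also get NODATASUM, because an extent that carries checksums would otherwise be forced back into COW:

	if (flags & BTRFS_INODE_NODATACOW) {
		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
		/* nocow data cannot keep csums valid when rewritten in place */
		if (S_ISREG(inode->i_mode))
			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
	}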
Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 3 +-- fs/btrfs/ioctl.c | 5 ++++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e95b1f90a1f6..67ed24ae86bb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4818,8 +4818,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, if (S_ISREG(mode)) { if (btrfs_test_opt(root, NODATASUM)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; - if (btrfs_test_opt(root, NODATACOW) || - (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW)) + if (btrfs_test_opt(root, NODATACOW)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d4608ab72b79..7624212ae926 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -141,8 +141,11 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; } - if (flags & BTRFS_INODE_NODATACOW) + if (flags & BTRFS_INODE_NODATACOW) { BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; + if (S_ISREG(inode->i_mode)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; + } btrfs_update_iflags(inode); } -- cgit v1.2.1 From 9b3234b9220aae5387b60bc35a424ab6748b2b59 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 4 Dec 2012 18:03:46 -0500 Subject: nfsd4: disable zero-copy on non-final read ops To ensure ordering of read data with any following operations, turn off zero copy if the read is not the final operation in the compound. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 11 +++++++++++ fs/nfsd/nfs4state.c | 8 -------- fs/nfsd/xdr4.h | 8 ++++++++ 3 files changed, 19 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index bd67f4d6dfc6..2a2d9b06a413 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -692,6 +692,17 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (read->rd_offset >= OFFSET_MAX) return nfserr_inval; + /* + * If we do a zero copy read, then a client will see read data + * that reflects the state of the file *after* performing the + * following compound. 
+ * + * To ensure proper ordering, we therefore turn off zero copy if + * the client wants us to do more in this compound: + */ + if (!nfsd4_last_compound_op(rqstp)) + rqstp->rq_splice_ok = false; + nfs4_lock_state(); /* check stateid */ if ((status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8e2555112966..8e127b39d323 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1855,14 +1855,6 @@ out_free_session: goto out; } -static bool nfsd4_last_compound_op(struct svc_rqst *rqstp) -{ - struct nfsd4_compoundres *resp = rqstp->rq_resp; - struct nfsd4_compoundargs *argp = rqstp->rq_argp; - - return argp->opcnt == resp->opcnt; -} - static __be32 nfsd4_map_bcts_dir(u32 *dir) { switch (*dir) { diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 331f8a3277ab..0889bfb43dc9 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -528,6 +528,14 @@ static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp) || nfsd4_is_solo_sequence(resp); } +static inline bool nfsd4_last_compound_op(struct svc_rqst *rqstp) +{ + struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfsd4_compoundargs *argp = rqstp->rq_argp; + + return argp->opcnt == resp->opcnt; +} + #define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs) static inline void -- cgit v1.2.1 From 965c8e59cfcf845ecde2265a1d1bfee5f011d302 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 17 Dec 2012 15:59:39 -0800 Subject: lseek: the "whence" argument is called "whence" But the kernel decided to call it "origin" instead. Fix most of the sites. Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/bad_inode.c | 2 +- fs/block_dev.c | 4 ++-- fs/btrfs/file.c | 16 ++++++++-------- fs/ceph/dir.c | 4 ++-- fs/ceph/file.c | 6 +++--- fs/cifs/cifsfs.c | 8 ++++---- fs/configfs/dir.c | 4 ++-- fs/ext3/dir.c | 6 +++--- fs/ext4/dir.c | 6 +++--- fs/ext4/file.c | 22 +++++++++++----------- fs/fuse/file.c | 8 ++++---- fs/gfs2/file.c | 10 +++++----- fs/libfs.c | 4 ++-- fs/nfs/dir.c | 6 +++--- fs/nfs/file.c | 10 +++++----- fs/ocfs2/extent_map.c | 12 ++++++------ fs/ocfs2/file.c | 6 +++--- fs/pstore/inode.c | 6 +++--- fs/read_write.c | 40 ++++++++++++++++++++-------------------- fs/seq_file.c | 4 ++-- fs/ubifs/dir.c | 4 ++-- 21 files changed, 94 insertions(+), 94 deletions(-) (limited to 'fs') diff --git a/fs/bad_inode.c b/fs/bad_inode.c index b1342ffb3cf6..922ad460bff9 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -16,7 +16,7 @@ #include -static loff_t bad_file_llseek(struct file *file, loff_t offset, int origin) +static loff_t bad_file_llseek(struct file *file, loff_t offset, int whence) { return -EIO; } diff --git a/fs/block_dev.c b/fs/block_dev.c index ab3a456f6650..172f8491a2bd 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -321,7 +321,7 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping, * for a block special file file->f_path.dentry->d_inode->i_size is zero * so we compute the size by hand (just as in block_read/write above) */ -static loff_t block_llseek(struct file *file, loff_t offset, int origin) +static loff_t block_llseek(struct file *file, loff_t offset, int whence) { struct inode *bd_inode = file->f_mapping->host; loff_t size; @@ -331,7 +331,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin) size = i_size_read(bd_inode); retval = -EINVAL; - switch (origin) { + switch (whence) { case SEEK_END: offset += size; break; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 
a8ee75cb96ee..9c6673a9231f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2120,7 +2120,7 @@ out: return ret; } -static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) +static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) { struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_map *em; @@ -2154,7 +2154,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) * before the position we want in case there is outstanding delalloc * going on here. */ - if (origin == SEEK_HOLE && start != 0) { + if (whence == SEEK_HOLE && start != 0) { if (start <= root->sectorsize) em = btrfs_get_extent_fiemap(inode, NULL, 0, 0, root->sectorsize, 0); @@ -2188,13 +2188,13 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) } } - if (origin == SEEK_HOLE) { + if (whence == SEEK_HOLE) { *offset = start; free_extent_map(em); break; } } else { - if (origin == SEEK_DATA) { + if (whence == SEEK_DATA) { if (em->block_start == EXTENT_MAP_DELALLOC) { if (start >= inode->i_size) { free_extent_map(em); @@ -2231,16 +2231,16 @@ out: return ret; } -static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin) +static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; int ret; mutex_lock(&inode->i_mutex); - switch (origin) { + switch (whence) { case SEEK_END: case SEEK_CUR: - offset = generic_file_llseek(file, offset, origin); + offset = generic_file_llseek(file, offset, whence); goto out; case SEEK_DATA: case SEEK_HOLE: @@ -2249,7 +2249,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin) return -ENXIO; } - ret = find_desired_extent(inode, &offset, origin); + ret = find_desired_extent(inode, &offset, whence); if (ret) { mutex_unlock(&inode->i_mutex); return ret; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index e5b77319c97b..8c1aabe93b67 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -454,7 +454,7 @@ static void reset_readdir(struct ceph_file_info *fi) fi->flags &= ~CEPH_F_ATEND; } -static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) +static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) { struct ceph_file_info *fi = file->private_data; struct inode *inode = file->f_mapping->host; @@ -463,7 +463,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) mutex_lock(&inode->i_mutex); retval = -EINVAL; - switch (origin) { + switch (whence) { case SEEK_END: offset += inode->i_size + 2; /* FIXME */ break; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 5840d2aaed15..d4dfdcf76d7f 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -797,7 +797,7 @@ out: /* * llseek. be sure to verify file size on SEEK_END. 
*/ -static loff_t ceph_llseek(struct file *file, loff_t offset, int origin) +static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; int ret; @@ -805,7 +805,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int origin) mutex_lock(&inode->i_mutex); __ceph_do_pending_vmtruncate(inode); - if (origin == SEEK_END || origin == SEEK_DATA || origin == SEEK_HOLE) { + if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); if (ret < 0) { offset = ret; @@ -813,7 +813,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int origin) } } - switch (origin) { + switch (whence) { case SEEK_END: offset += inode->i_size; break; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 210f0af83fc4..ce9f3c5421bf 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -695,13 +695,13 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, return written; } -static loff_t cifs_llseek(struct file *file, loff_t offset, int origin) +static loff_t cifs_llseek(struct file *file, loff_t offset, int whence) { /* - * origin == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate + * whence == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate * the cached file length */ - if (origin != SEEK_SET && origin != SEEK_CUR) { + if (whence != SEEK_SET && whence != SEEK_CUR) { int rc; struct inode *inode = file->f_path.dentry->d_inode; @@ -728,7 +728,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin) if (rc < 0) return (loff_t)rc; } - return generic_file_llseek(file, offset, origin); + return generic_file_llseek(file, offset, whence); } static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 7414ae24a79b..712b10f64c70 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1613,12 +1613,12 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir return 0; } -static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin) +static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry * dentry = file->f_path.dentry; mutex_lock(&dentry->d_inode->i_mutex); - switch (origin) { + switch (whence) { case 1: offset += file->f_pos; case 0: diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index c8fff930790d..dd91264ba94f 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -296,17 +296,17 @@ static inline loff_t ext3_get_htree_eof(struct file *filp) * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX) * will be invalid once the directory was converted into a dx directory */ -loff_t ext3_dir_llseek(struct file *file, loff_t offset, int origin) +loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; int dx_dir = is_dx_dir(inode); loff_t htree_max = ext3_get_htree_eof(file); if (likely(dx_dir)) - return generic_file_llseek_size(file, offset, origin, + return generic_file_llseek_size(file, offset, whence, htree_max, htree_max); else - return generic_file_llseek(file, offset, origin); + return generic_file_llseek(file, offset, whence); } /* diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index b8d877f6c1fa..80a28b297279 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -333,17 +333,17 @@ static inline loff_t ext4_get_htree_eof(struct file *filp) * * For non-htree, ext4_llseek already chooses the proper 
max offset. */ -loff_t ext4_dir_llseek(struct file *file, loff_t offset, int origin) +loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; int dx_dir = is_dx_dir(inode); loff_t htree_max = ext4_get_htree_eof(file); if (likely(dx_dir)) - return generic_file_llseek_size(file, offset, origin, + return generic_file_llseek_size(file, offset, whence, htree_max, htree_max); else - return ext4_llseek(file, offset, origin); + return ext4_llseek(file, offset, whence); } /* diff --git a/fs/ext4/file.c b/fs/ext4/file.c index b64a60bf105a..d07c27ca594a 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -303,7 +303,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) * page cache has data or not. */ static int ext4_find_unwritten_pgoff(struct inode *inode, - int origin, + int whence, struct ext4_map_blocks *map, loff_t *offset) { @@ -333,10 +333,10 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, (pgoff_t)num); if (nr_pages == 0) { - if (origin == SEEK_DATA) + if (whence == SEEK_DATA) break; - BUG_ON(origin != SEEK_HOLE); + BUG_ON(whence != SEEK_HOLE); /* * If this is the first time to go into the loop and * offset is not beyond the end offset, it will be a @@ -352,7 +352,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, * offset is smaller than the first page offset, it will be a * hole at this offset. */ - if (lastoff == startoff && origin == SEEK_HOLE && + if (lastoff == startoff && whence == SEEK_HOLE && lastoff < page_offset(pvec.pages[0])) { found = 1; break; @@ -366,7 +366,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, * If the current offset is not beyond the end of given * range, it will be a hole. */ - if (lastoff < endoff && origin == SEEK_HOLE && + if (lastoff < endoff && whence == SEEK_HOLE && page->index > end) { found = 1; *offset = lastoff; @@ -391,10 +391,10 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, do { if (buffer_uptodate(bh) || buffer_unwritten(bh)) { - if (origin == SEEK_DATA) + if (whence == SEEK_DATA) found = 1; } else { - if (origin == SEEK_HOLE) + if (whence == SEEK_HOLE) found = 1; } if (found) { @@ -416,7 +416,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, * The no. of pages is less than our desired, that would be a * hole in there. */ - if (nr_pages < num && origin == SEEK_HOLE) { + if (nr_pages < num && whence == SEEK_HOLE) { found = 1; *offset = lastoff; break; @@ -609,7 +609,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) * by calling generic_file_llseek_size() with the appropriate maxbytes * value for each. 
*/ -loff_t ext4_llseek(struct file *file, loff_t offset, int origin) +loff_t ext4_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; loff_t maxbytes; @@ -619,11 +619,11 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int origin) else maxbytes = inode->i_sb->s_maxbytes; - switch (origin) { + switch (whence) { case SEEK_SET: case SEEK_CUR: case SEEK_END: - return generic_file_llseek_size(file, offset, origin, + return generic_file_llseek_size(file, offset, whence, maxbytes, i_size_read(inode)); case SEEK_DATA: return ext4_seek_data(file, offset, maxbytes); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 78d2837bc940..e21d4d8f87e3 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1599,19 +1599,19 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) return err ? 0 : outarg.block; } -static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin) +static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) { loff_t retval; struct inode *inode = file->f_path.dentry->d_inode; /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */ - if (origin == SEEK_CUR || origin == SEEK_SET) - return generic_file_llseek(file, offset, origin); + if (whence == SEEK_CUR || whence == SEEK_SET) + return generic_file_llseek(file, offset, whence); mutex_lock(&inode->i_mutex); retval = fuse_update_attributes(inode, NULL, file, NULL); if (!retval) - retval = generic_file_llseek(file, offset, origin); + retval = generic_file_llseek(file, offset, whence); mutex_unlock(&inode->i_mutex); return retval; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index dfe2d8cb9b2c..991ab2d484dd 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -44,7 +44,7 @@ * gfs2_llseek - seek to a location in a file * @file: the file * @offset: the offset - * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END) + * @whence: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END) * * SEEK_END requires the glock for the file because it references the * file's size. 
@@ -52,26 +52,26 @@ * Returns: The new offset, or errno */ -static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin) +static loff_t gfs2_llseek(struct file *file, loff_t offset, int whence) { struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); struct gfs2_holder i_gh; loff_t error; - switch (origin) { + switch (whence) { case SEEK_END: /* These reference inode->i_size */ case SEEK_DATA: case SEEK_HOLE: error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (!error) { - error = generic_file_llseek(file, offset, origin); + error = generic_file_llseek(file, offset, whence); gfs2_glock_dq_uninit(&i_gh); } break; case SEEK_CUR: case SEEK_SET: - error = generic_file_llseek(file, offset, origin); + error = generic_file_llseek(file, offset, whence); break; default: error = -EINVAL; diff --git a/fs/libfs.c b/fs/libfs.c index 7cc37ca19cd8..35fc6e74cd88 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -81,11 +81,11 @@ int dcache_dir_close(struct inode *inode, struct file *file) return 0; } -loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) +loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry *dentry = file->f_path.dentry; mutex_lock(&dentry->d_inode->i_mutex); - switch (origin) { + switch (whence) { case 1: offset += file->f_pos; case 0: diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index b9e66b7e0c14..1cc71f60b491 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -871,7 +871,7 @@ out: return res; } -static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) +static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) { struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; @@ -880,10 +880,10 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n", dentry->d_parent->d_name.name, dentry->d_name.name, - offset, origin); + offset, whence); mutex_lock(&inode->i_mutex); - switch (origin) { + switch (whence) { case 1: offset += filp->f_pos; case 0: diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 582bb8866131..3c2b893665ba 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -119,18 +119,18 @@ force_reval: return __nfs_revalidate_inode(server, inode); } -loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) +loff_t nfs_file_llseek(struct file *filp, loff_t offset, int whence) { dprintk("NFS: llseek file(%s/%s, %lld, %d)\n", filp->f_path.dentry->d_parent->d_name.name, filp->f_path.dentry->d_name.name, - offset, origin); + offset, whence); /* - * origin == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate + * whence == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate * the cached file length */ - if (origin != SEEK_SET && origin != SEEK_CUR) { + if (whence != SEEK_SET && whence != SEEK_CUR) { struct inode *inode = filp->f_mapping->host; int retval = nfs_revalidate_file_size(inode, filp); @@ -138,7 +138,7 @@ loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) return (loff_t)retval; } - return generic_file_llseek(filp, offset, origin); + return generic_file_llseek(filp, offset, whence); } EXPORT_SYMBOL_GPL(nfs_file_llseek); diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 70b5863a2d64..f487aa343442 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -832,7 +832,7 @@ out: return ret; } -int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin) +int ocfs2_seek_data_hole_offset(struct file *file, loff_t 
*offset, int whence) { struct inode *inode = file->f_mapping->host; int ret; @@ -843,7 +843,7 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin) struct buffer_head *di_bh = NULL; struct ocfs2_extent_rec rec; - BUG_ON(origin != SEEK_DATA && origin != SEEK_HOLE); + BUG_ON(whence != SEEK_DATA && whence != SEEK_HOLE); ret = ocfs2_inode_lock(inode, &di_bh, 0); if (ret) { @@ -859,7 +859,7 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin) } if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { - if (origin == SEEK_HOLE) + if (whence == SEEK_HOLE) *offset = inode->i_size; goto out_unlock; } @@ -888,8 +888,8 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin) is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 0 : 1; } - if ((!is_data && origin == SEEK_HOLE) || - (is_data && origin == SEEK_DATA)) { + if ((!is_data && whence == SEEK_HOLE) || + (is_data && whence == SEEK_DATA)) { if (extoff > *offset) *offset = extoff; goto out_unlock; @@ -899,7 +899,7 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin) cpos += clen; } - if (origin == SEEK_HOLE) { + if (whence == SEEK_HOLE) { extoff = cpos; extoff <<= cs_bits; extlen = clen; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index dda089804942..fe492e1a3cfc 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2637,14 +2637,14 @@ bail: } /* Refer generic_file_llseek_unlocked() */ -static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin) +static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; int ret = 0; mutex_lock(&inode->i_mutex); - switch (origin) { + switch (whence) { case SEEK_SET: break; case SEEK_END: @@ -2659,7 +2659,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin) break; case SEEK_DATA: case SEEK_HOLE: - ret = ocfs2_seek_data_hole_offset(file, &offset, origin); + ret = ocfs2_seek_data_hole_offset(file, &offset, whence); if (ret) goto out; break; diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index ed1d8c7212da..67de74ca85f4 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -151,13 +151,13 @@ static int pstore_file_open(struct inode *inode, struct file *file) return 0; } -static loff_t pstore_file_llseek(struct file *file, loff_t off, int origin) +static loff_t pstore_file_llseek(struct file *file, loff_t off, int whence) { struct seq_file *sf = file->private_data; if (sf->op) - return seq_lseek(file, off, origin); - return default_llseek(file, off, origin); + return seq_lseek(file, off, whence); + return default_llseek(file, off, whence); } static const struct file_operations pstore_file_operations = { diff --git a/fs/read_write.c b/fs/read_write.c index d06534857e9e..1edaf099ddd7 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -54,7 +54,7 @@ static loff_t lseek_execute(struct file *file, struct inode *inode, * generic_file_llseek_size - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to - * @origin: type of seek + * @whence: type of seek * @size: max size of this file in file system * @eof: offset used for SEEK_END position * @@ -67,12 +67,12 @@ static loff_t lseek_execute(struct file *file, struct inode *inode, * read/writes behave like SEEK_SET against seeks. 
*/ loff_t -generic_file_llseek_size(struct file *file, loff_t offset, int origin, +generic_file_llseek_size(struct file *file, loff_t offset, int whence, loff_t maxsize, loff_t eof) { struct inode *inode = file->f_mapping->host; - switch (origin) { + switch (whence) { case SEEK_END: offset += eof; break; @@ -122,17 +122,17 @@ EXPORT_SYMBOL(generic_file_llseek_size); * generic_file_llseek - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to - * @origin: type of seek + * @whence: type of seek * * This is a generic implemenation of ->llseek useable for all normal local * filesystems. It just updates the file offset to the value specified by - * @offset and @origin under i_mutex. + * @offset and @whence under i_mutex. */ -loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) +loff_t generic_file_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; - return generic_file_llseek_size(file, offset, origin, + return generic_file_llseek_size(file, offset, whence, inode->i_sb->s_maxbytes, i_size_read(inode)); } @@ -142,32 +142,32 @@ EXPORT_SYMBOL(generic_file_llseek); * noop_llseek - No Operation Performed llseek implementation * @file: file structure to seek on * @offset: file offset to seek to - * @origin: type of seek + * @whence: type of seek * * This is an implementation of ->llseek useable for the rare special case when * userspace expects the seek to succeed but the (device) file is actually not * able to perform the seek. In this case you use noop_llseek() instead of * falling back to the default implementation of ->llseek. */ -loff_t noop_llseek(struct file *file, loff_t offset, int origin) +loff_t noop_llseek(struct file *file, loff_t offset, int whence) { return file->f_pos; } EXPORT_SYMBOL(noop_llseek); -loff_t no_llseek(struct file *file, loff_t offset, int origin) +loff_t no_llseek(struct file *file, loff_t offset, int whence) { return -ESPIPE; } EXPORT_SYMBOL(no_llseek); -loff_t default_llseek(struct file *file, loff_t offset, int origin) +loff_t default_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_path.dentry->d_inode; loff_t retval; mutex_lock(&inode->i_mutex); - switch (origin) { + switch (whence) { case SEEK_END: offset += i_size_read(inode); break; @@ -216,7 +216,7 @@ out: } EXPORT_SYMBOL(default_llseek); -loff_t vfs_llseek(struct file *file, loff_t offset, int origin) +loff_t vfs_llseek(struct file *file, loff_t offset, int whence) { loff_t (*fn)(struct file *, loff_t, int); @@ -225,11 +225,11 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int origin) if (file->f_op && file->f_op->llseek) fn = file->f_op->llseek; } - return fn(file, offset, origin); + return fn(file, offset, whence); } EXPORT_SYMBOL(vfs_llseek); -SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin) +SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) { off_t retval; struct fd f = fdget(fd); @@ -237,8 +237,8 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin) return -EBADF; retval = -EINVAL; - if (origin <= SEEK_MAX) { - loff_t res = vfs_llseek(f.file, offset, origin); + if (whence <= SEEK_MAX) { + loff_t res = vfs_llseek(f.file, offset, whence); retval = res; if (res != (loff_t)retval) retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ @@ -250,7 +250,7 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin) #ifdef 
__ARCH_WANT_SYS_LLSEEK SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, unsigned long, offset_low, loff_t __user *, result, - unsigned int, origin) + unsigned int, whence) { int retval; struct fd f = fdget(fd); @@ -260,11 +260,11 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, return -EBADF; retval = -EINVAL; - if (origin > SEEK_MAX) + if (whence > SEEK_MAX) goto out_putf; offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low, - origin); + whence); retval = (int)offset; if (offset >= 0) { diff --git a/fs/seq_file.c b/fs/seq_file.c index 99dffab4c4e4..9d863fb501f9 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -300,14 +300,14 @@ EXPORT_SYMBOL(seq_read); * * Ready-made ->f_op->llseek() */ -loff_t seq_lseek(struct file *file, loff_t offset, int origin) +loff_t seq_lseek(struct file *file, loff_t offset, int whence) { struct seq_file *m = file->private_data; loff_t retval = -EINVAL; mutex_lock(&m->lock); m->version = file->f_version; - switch (origin) { + switch (whence) { case 1: offset += file->f_pos; case 0: diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index e271fba1651b..8a574776a493 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -453,11 +453,11 @@ out: } /* If a directory is seeked, we have to free saved readdir() state */ -static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int origin) +static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int whence) { kfree(file->private_data); file->private_data = NULL; - return generic_file_llseek(file, offset, origin); + return generic_file_llseek(file, offset, whence); } /* Free saved readdir() state when the directory is closed */ -- cgit v1.2.1 From ac5f121b8f2cad52b7671f9af872f8761b0ea1d4 Mon Sep 17 00:00:00 2001 From: Tushar Behera Date: Mon, 17 Dec 2012 15:59:40 -0800 Subject: fs/notify/inode_mark.c: make fsnotify_find_inode_mark_locked() static Fixes following sparse warning: fs/notify/inode_mark.c:127:22: warning: symbol 'fsnotify_find_inode_mark_locked' was not declared. Should it be static? Signed-off-by: Tushar Behera Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/inode_mark.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index b13c00ac48eb..f3035691f528 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -116,8 +116,9 @@ void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) * given a group and inode, find the mark associated with that combination. 
* if found take a reference to that mark and return it, else return NULL */ -struct fsnotify_mark *fsnotify_find_inode_mark_locked(struct fsnotify_group *group, - struct inode *inode) +static struct fsnotify_mark *fsnotify_find_inode_mark_locked( + struct fsnotify_group *group, + struct inode *inode) { struct fsnotify_mark *mark; struct hlist_node *pos; -- cgit v1.2.1 From f9a00e8738c209d95493cf97d3a82ab2655892e5 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 17 Dec 2012 16:01:25 -0800 Subject: procfs: use kbasename() [yongjun_wei@trendmicro.com.cn: remove duplicated include] Signed-off-by: Andy Shevchenko Signed-off-by: Wei Yongjun Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_devtree.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index df7dd08d4391..de20ec480fa0 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -195,11 +195,7 @@ void proc_device_tree_add_node(struct device_node *np, set_node_proc_entry(np, de); for (child = NULL; (child = of_get_next_child(np, child));) { /* Use everything after the last slash, or the full name */ - p = strrchr(child->full_name, '/'); - if (!p) - p = child->full_name; - else - ++p; + p = kbasename(child->full_name); if (duplicate_name(de, p)) p = fixup_name(np, de, p); -- cgit v1.2.1 From 6899e92d65c490c5292752718ff277b123f8c00a Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Mon, 17 Dec 2012 16:02:09 -0800 Subject: binfmt_elf: fix corner case kfree of uninitialized data If elf_core_dump() is called and fill_note_info() fails in the kmalloc() then it returns 0 but has not yet initialised all the needed fields. As a result we do a kfree(randomness) after correctly skipping the thread data. [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 6d7d1647a68c..0c42cdbabecf 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1601,8 +1601,10 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, info->thread = NULL; psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); - if (psinfo == NULL) + if (psinfo == NULL) { + info->psinfo.data = NULL; /* So we don't free this wrongly */ return 0; + } fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); -- cgit v1.2.1 From f562146a3daf6aa0bbf2a1bc4b6b7da031ed5dcd Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Mon, 17 Dec 2012 16:02:56 -0800 Subject: fat: notify when discard is not supported Change fatfs so that a warning is emitted when an attempt is made to mount a filesystem with the unsupported `discard' option. 
ext4 aready does this: http://patchwork.ozlabs.org/patch/192668/ Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/inode.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 5bafaad00530..7b186a5d51b1 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include "fat.h" @@ -1431,6 +1432,14 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, goto out_fail; } + if (sbi->options.discard) { + struct request_queue *q = bdev_get_queue(sb->s_bdev); + if (!blk_queue_discard(q)) + fat_msg(sb, KERN_WARNING, + "mounting with \"discard\" option, but " + "the device does not support discard"); + } + return 0; out_invalid: -- cgit v1.2.1 From 58156c8fbf43e71dd091848d4dbfd780d04016e6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 17 Dec 2012 16:02:58 -0800 Subject: fat: provide option for setting timezone offset So far FAT either offsets time stamps by sys_tz.minuteswest or leaves them as they are (when tz=UTC mount option is used). However in some cases it is useful if one can specify time stamp offset on his own (e.g. when time zone of the camera connected is different from time zone of the computer, or when HW clock is in UTC and thus sys_tz.minuteswest == 0). So provide a mount option time_offset= which allows user to specify offset in minutes that should be applied to time stamps on the filesystem. akpm: this code would work incorrectly when used via `mount -o remount', because cached inodes would not be updated. But fatfs's fat_remount() is basically a no-op anyway. Signed-off-by: Jan Kara Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/fat.h | 3 ++- fs/fat/inode.c | 25 ++++++++++++++++++++----- fs/fat/misc.c | 9 ++++++--- 3 files changed, 28 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 623f36f0423b..12701a567752 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -29,6 +29,7 @@ struct fat_mount_options { unsigned short fs_fmask; unsigned short fs_dmask; unsigned short codepage; /* Codepage for shortname conversions */ + int time_offset; /* Offset of timestamps from UTC (in minutes) */ char *iocharset; /* Charset used for filename input/display */ unsigned short shortname; /* flags for shortname display/create rule */ unsigned char name_check; /* r = relaxed, n = normal, s = strict */ @@ -45,7 +46,7 @@ struct fat_mount_options { flush:1, /* write things quickly */ nocase:1, /* Does this need case conversion? 
0=need case conversion*/ usefree:1, /* Use free_clusters for FAT32 */ - tz_utc:1, /* Filesystem timestamps are in UTC */ + tz_set:1, /* Filesystem timestamps' offset set */ rodir:1, /* allow ATTR_RO for directory */ discard:1, /* Issue discard requests on deletions */ nfs:1; /* Do extra work needed for NFS export */ diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 7b186a5d51b1..59ac83be2d5b 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -778,8 +778,12 @@ static int fat_show_options(struct seq_file *m, struct dentry *root) } if (opts->flush) seq_puts(m, ",flush"); - if (opts->tz_utc) - seq_puts(m, ",tz=UTC"); + if (opts->tz_set) { + if (opts->time_offset) + seq_printf(m, ",time_offset=%d", opts->time_offset); + else + seq_puts(m, ",tz=UTC"); + } if (opts->errors == FAT_ERRORS_CONT) seq_puts(m, ",errors=continue"); else if (opts->errors == FAT_ERRORS_PANIC) @@ -801,7 +805,8 @@ enum { Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes, Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont, - Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_err, + Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_time_offset, + Opt_err, }; static const match_table_t fat_tokens = { @@ -826,6 +831,7 @@ static const match_table_t fat_tokens = { {Opt_immutable, "sys_immutable"}, {Opt_flush, "flush"}, {Opt_tz_utc, "tz=UTC"}, + {Opt_time_offset, "time_offset=%d"}, {Opt_err_cont, "errors=continue"}, {Opt_err_panic, "errors=panic"}, {Opt_err_ro, "errors=remount-ro"}, @@ -910,7 +916,7 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, opts->utf8 = opts->unicode_xlate = 0; opts->numtail = 1; opts->usefree = opts->nocase = 0; - opts->tz_utc = 0; + opts->tz_set = 0; opts->nfs = 0; opts->errors = FAT_ERRORS_RO; *debug = 0; @@ -1006,8 +1012,17 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, case Opt_flush: opts->flush = 1; break; + case Opt_time_offset: + if (match_int(&args[0], &option)) + return 0; + if (option < -12 * 60 || option > 12 * 60) + return 0; + opts->tz_set = 1; + opts->time_offset = option; + break; case Opt_tz_utc: - opts->tz_utc = 1; + opts->tz_set = 1; + opts->time_offset = 0; break; case Opt_err_cont: opts->errors = FAT_ERRORS_CONT; diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 6d93360ca0cc..5eb600dc43a9 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -212,8 +212,10 @@ void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, + days_in_year[month] + day + DAYS_DELTA) * SECS_PER_DAY; - if (!sbi->options.tz_utc) + if (!sbi->options.tz_set) second += sys_tz.tz_minuteswest * SECS_PER_MIN; + else + second -= sbi->options.time_offset * SECS_PER_MIN; if (time_cs) { ts->tv_sec = second + (time_cs / 100); @@ -229,8 +231,9 @@ void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts, __le16 *time, __le16 *date, u8 *time_cs) { struct tm tm; - time_to_tm(ts->tv_sec, sbi->options.tz_utc ? 0 : - -sys_tz.tz_minuteswest * 60, &tm); + time_to_tm(ts->tv_sec, + (sbi->options.tz_set ? sbi->options.time_offset : + -sys_tz.tz_minuteswest) * SECS_PER_MIN, &tm); /* FAT can only support year between 1980 to 2107 */ if (tm.tm_year < 1980 - 1900) { -- cgit v1.2.1 From 5b3d5aeaa333850756f41350fed2fc95912b2a4f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 17 Dec 2012 16:02:59 -0800 Subject: fat: ix mount option parsing parse_options() is supposed to return value < 0 on error however we returned 0 (success) in a lot of cases. 
This actually was not a problem in practice because match_token() used by parse_options() is clever and catches most of the problems for us. Signed-off-by: Jan Kara Cc: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/inode.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 59ac83be2d5b..3b733a730952 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -972,41 +972,41 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, break; case Opt_uid: if (match_int(&args[0], &option)) - return 0; + return -EINVAL; opts->fs_uid = make_kuid(current_user_ns(), option); if (!uid_valid(opts->fs_uid)) - return 0; + return -EINVAL; break; case Opt_gid: if (match_int(&args[0], &option)) - return 0; + return -EINVAL; opts->fs_gid = make_kgid(current_user_ns(), option); if (!gid_valid(opts->fs_gid)) - return 0; + return -EINVAL; break; case Opt_umask: if (match_octal(&args[0], &option)) - return 0; + return -EINVAL; opts->fs_fmask = opts->fs_dmask = option; break; case Opt_dmask: if (match_octal(&args[0], &option)) - return 0; + return -EINVAL; opts->fs_dmask = option; break; case Opt_fmask: if (match_octal(&args[0], &option)) - return 0; + return -EINVAL; opts->fs_fmask = option; break; case Opt_allow_utime: if (match_octal(&args[0], &option)) - return 0; + return -EINVAL; opts->allow_utime = option & (S_IWGRP | S_IWOTH); break; case Opt_codepage: if (match_int(&args[0], &option)) - return 0; + return -EINVAL; opts->codepage = option; break; case Opt_flush: @@ -1014,9 +1014,9 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, break; case Opt_time_offset: if (match_int(&args[0], &option)) - return 0; + return -EINVAL; if (option < -12 * 60 || option > 12 * 60) - return 0; + return -EINVAL; opts->tz_set = 1; opts->time_offset = option; break; -- cgit v1.2.1 From c6c20372bbb2f70d2757eed0a8d6860884bae11f Mon Sep 17 00:00:00 2001 From: Dave Reisner Date: Mon, 17 Dec 2012 16:03:01 -0800 Subject: fs/fat: strip "cp" prefix from codepage in display Option parsing code expects an unsigned integer for the codepage option, but prefixes and stores this option with "cp" before passing to load_nls(). This makes the displayed option in /proc an invalid one. Strip the prefix when printing so that the displayed option is valid for reuse. Signed-off-by: Dave Reisner Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 3b733a730952..35806813ea4e 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -726,7 +726,8 @@ static int fat_show_options(struct seq_file *m, struct dentry *root) if (opts->allow_utime) seq_printf(m, ",allow_utime=%04o", opts->allow_utime); if (sbi->nls_disk) - seq_printf(m, ",codepage=%s", sbi->nls_disk->charset); + /* strip "cp" prefix from displayed option */ + seq_printf(m, ",codepage=%s", &sbi->nls_disk->charset[2]); if (isvfat) { if (sbi->nls_io) seq_printf(m, ",iocharset=%s", sbi->nls_io->charset); -- cgit v1.2.1 From 7b9a7ec565505699f503b4fcf61500dceb36e744 Mon Sep 17 00:00:00 2001 From: Andrew Vagin Date: Mon, 17 Dec 2012 16:03:10 -0800 Subject: proc: don't show nonexistent capabilities Without this patch it is really hard to interpret a bounding set, if CAP_LAST_CAP is unknown for a current kernel. 
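To make the problem concrete, here is a minimal userspace sketch of the masking this patch performs (illustration only, not part of the patch; the CAP_LAST_CAP value of 36 is an assumption matching kernels of this era, and the input/output values are the CapBnd pair quoted below):

    #include <stdint.h>
    #include <stdio.h>

    #define CAP_LAST_CAP 36	/* assumed value for this kernel series */

    int main(void)
    {
        uint64_t cap_bnd = 0xffffffe0fdecffffULL;	/* raw bounding set */
        /* keep only bits 0..CAP_LAST_CAP, as NORM_CAPS() below does
         * one 32-bit word at a time */
        uint64_t mask = (1ULL << (CAP_LAST_CAP + 1)) - 1;

        printf("CapBnd:\t%016llx\n", (unsigned long long)(cap_bnd & mask));
        /* prints: CapBnd: 00000000fdecffff */
        return 0;
    }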
Non-existent capabilities cannot be deleted from a bounding set with the help of prctl. E.g., here are two examples without/with this patch. CapBnd: ffffffe0fdecffff CapBnd: 00000000fdecffff I suggest hiding non-existent capabilities. There are two reasons. * It is more logical and easier to use. * It helps to checkpoint-restore capabilities of tasks, because tasks can be restored on another kernel, where CAP_LAST_CAP is bigger. Signed-off-by: Andrew Vagin Cc: Andrew G. Morgan Reviewed-by: Serge E. Hallyn Cc: Pavel Emelyanov Reviewed-by: Kees Cook Cc: KAMEZAWA Hiroyuki Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index d3696708fc1a..377a37366dde 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -308,6 +308,10 @@ static void render_cap_t(struct seq_file *m, const char *header, seq_putc(m, '\n'); } +/* Remove non-existent capabilities */ +#define NORM_CAPS(v) (v.cap[CAP_TO_INDEX(CAP_LAST_CAP)] &= \ + CAP_TO_MASK(CAP_LAST_CAP + 1) - 1) + static inline void task_cap(struct seq_file *m, struct task_struct *p) { const struct cred *cred; @@ -321,6 +325,11 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p) cap_bset = cred->cap_bset; rcu_read_unlock(); + NORM_CAPS(cap_inheritable); + NORM_CAPS(cap_permitted); + NORM_CAPS(cap_effective); + NORM_CAPS(cap_bset); + render_cap_t(m, "CapInh:\t", &cap_inheritable); render_cap_t(m, "CapPrm:\t", &cap_permitted); render_cap_t(m, "CapEff:\t", &cap_effective); -- cgit v1.2.1 From 834f82e2aa9a8ede94b17b656329f850c1471514 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 17 Dec 2012 16:03:13 -0800 Subject: procfs: add VmFlags field in smaps output During c/r sessions we've found that there is no way at the moment to fetch some VMA-associated flags, such as mlock() and madvise(). This leads us to a problem -- we don't know if we should call mlock() and/or madvise() after restore on the vma area we're bringing back to life. This patch introduces a new field in the "smaps" output called VmFlags, where all set flags associated with the particular VMA are shown as two-letter mnemonics. [ Strictly speaking for c/r we only need the mlock/madvise bits, but it has been said that providing just a few flags looks somehow inconsistent. So all flags are here now. ] This feature is made available on CONFIG_CHECKPOINT_RESTORE=n kernels, as other applications may start to use these fields. The data is encoded in a somewhat awkward two-letter mnemonic form, to encourage userspace to be prepared for fields being added or removed in the future. 
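As an illustration of how userspace might consume the new field, here is a minimal hypothetical parser (not part of the patch; it assumes only the "VmFlags:" prefix and the space-separated two-letter mnemonics described above, and vma_has_flag() is an invented helper name):

    #include <stdio.h>
    #include <string.h>

    /* Return 1 if a smaps VmFlags line contains the given two-letter
     * mnemonic (e.g. "lo" for VM_LOCKED); assumes well-formed
     * space-separated two-letter tokens. */
    static int vma_has_flag(const char *vmflags_line, const char *mn)
    {
        const char *p = vmflags_line;

        if (strncmp(p, "VmFlags:", 8) != 0)
            return 0;
        p += 8;
        while (*p) {
            while (*p == ' ' || *p == '\n')
                p++;
            if (!*p)
                break;
            if (p[0] == mn[0] && p[1] == mn[1])
                return 1;
            while (*p && *p != ' ')	/* skip rest of this token */
                p++;
        }
        return 0;
    }

    int main(void)
    {
        const char *line = "VmFlags: rd wr mr mw me lo ac \n";

        printf("VM_LOCKED: %d\n", vma_has_flag(line, "lo"));
        return 0;
    }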
[a.p.zijlstra@chello.nl: props to use for_each_set_bit] [sfr@canb.auug.org.au: props to use array instead of struct] [akpm@linux-foundation.org: overall redesign and simplification] [akpm@linux-foundation.org: remove unneeded braces per sfr, avoid using bloaty for_each_set_bit()] Signed-off-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: Peter Zijlstra Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 48775628abbf..448455b7fd91 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -526,6 +526,57 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return 0; } +static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) +{ + /* + * Don't forget to update Documentation/ on changes. + */ + static const char mnemonics[BITS_PER_LONG][2] = { + /* + * In case if we meet a flag we don't know about. + */ + [0 ... (BITS_PER_LONG-1)] = "??", + + [ilog2(VM_READ)] = "rd", + [ilog2(VM_WRITE)] = "wr", + [ilog2(VM_EXEC)] = "ex", + [ilog2(VM_SHARED)] = "sh", + [ilog2(VM_MAYREAD)] = "mr", + [ilog2(VM_MAYWRITE)] = "mw", + [ilog2(VM_MAYEXEC)] = "me", + [ilog2(VM_MAYSHARE)] = "ms", + [ilog2(VM_GROWSDOWN)] = "gd", + [ilog2(VM_PFNMAP)] = "pf", + [ilog2(VM_DENYWRITE)] = "dw", + [ilog2(VM_LOCKED)] = "lo", + [ilog2(VM_IO)] = "io", + [ilog2(VM_SEQ_READ)] = "sr", + [ilog2(VM_RAND_READ)] = "rr", + [ilog2(VM_DONTCOPY)] = "dc", + [ilog2(VM_DONTEXPAND)] = "de", + [ilog2(VM_ACCOUNT)] = "ac", + [ilog2(VM_NORESERVE)] = "nr", + [ilog2(VM_HUGETLB)] = "ht", + [ilog2(VM_NONLINEAR)] = "nl", + [ilog2(VM_ARCH_1)] = "ar", + [ilog2(VM_DONTDUMP)] = "dd", + [ilog2(VM_MIXEDMAP)] = "mm", + [ilog2(VM_HUGEPAGE)] = "hg", + [ilog2(VM_NOHUGEPAGE)] = "nh", + [ilog2(VM_MERGEABLE)] = "mg", + }; + size_t i; + + seq_puts(m, "VmFlags: "); + for (i = 0; i < BITS_PER_LONG; i++) { + if (vma->vm_flags & (1UL << i)) { + seq_printf(m, "%c%c ", + mnemonics[i][0], mnemonics[i][1]); + } + } + seq_putc(m, '\n'); +} + static int show_smap(struct seq_file *m, void *v, int is_pid) { struct proc_maps_private *priv = m->private; @@ -581,6 +632,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) seq_printf(m, "Nonlinear: %8lu kB\n", mss.nonlinear >> 10); + show_smap_vma_flags(m, vma); + if (m->count < m->size) /* vma is copied successfully */ m->version = (vma != get_gate_vma(task->mm)) ? vma->vm_start : 0; -- cgit v1.2.1 From 2f4b3bf6b2318cfaa177ec5a802f4d8d6afbd816 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 17 Dec 2012 16:03:14 -0800 Subject: /proc/pid/status: add "Seccomp" field It is currently impossible to examine the state of seccomp for a given process. While attaching with gdb and attempting "call prctl(PR_GET_SECCOMP,...)" will work with some situations, it is not reliable. If the process is in seccomp mode 1, this query will kill the process (prctl not allowed), if the process is in mode 2 with prctl not allowed, it will similarly be killed, and in weird cases, if prctl is filtered to return errno 0, it can look like seccomp is disabled. When reviewing the state of running processes, there should be a way to externally examine the seccomp mode. ("Did this build of Chrome end up using seccomp?" "Did my distro ship ssh with seccomp enabled?") This adds the "Seccomp" line to /proc/$pid/status. 
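For example, a monitoring tool could read the new line like this (a hypothetical consumer, with get_seccomp_mode() an invented name; it relies only on the "Seccomp:\t%d" format added by this patch):

    #include <stdio.h>

    /* Return the seccomp mode of a task, or -1 if the field is absent
     * (e.g. a kernel built without CONFIG_SECCOMP). */
    static int get_seccomp_mode(int pid)
    {
        char path[64], line[256];
        int mode = -1;
        FILE *f;

        snprintf(path, sizeof(path), "/proc/%d/status", pid);
        f = fopen(path, "r");
        if (!f)
            return -1;
        while (fgets(line, sizeof(line), f))
            if (sscanf(line, "Seccomp: %d", &mode) == 1)
                break;
        fclose(f);
        return mode;
    }

    int main(void)
    {
        printf("init seccomp mode: %d\n", get_seccomp_mode(1));
        return 0;
    }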
Signed-off-by: Kees Cook Reviewed-by: Cyrill Gorcunov Cc: Andrea Arcangeli Cc: James Morris Acked-by: Serge E. Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index 377a37366dde..077235ffb38b 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -336,6 +336,13 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p) render_cap_t(m, "CapBnd:\t", &cap_bset); } +static inline void task_seccomp(struct seq_file *m, struct task_struct *p) +{ +#ifdef CONFIG_SECCOMP + seq_printf(m, "Seccomp:\t%d\n", p->seccomp.mode); +#endif +} + static inline void task_context_switch_counts(struct seq_file *m, struct task_struct *p) { @@ -369,6 +376,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, } task_sig(m, task); task_cap(m, task); + task_seccomp(m, task); task_cpus_allowed(m, task); cpuset_task_status_allowed(m, task); task_context_switch_counts(m, task); -- cgit v1.2.1 From 8d238027b87e654be552eabdf492042a34c5c300 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 17 Dec 2012 16:03:17 -0800 Subject: proc: pid/status: show all supplementary groups We display a list of supplementary group for each process in /proc//status. However, we show only the first 32 groups, not all of them. Although this is rare, but sometimes processes do have more than 32 supplementary groups, and this kernel limitation breaks user-space apps that rely on the group list in /proc//status. Number 32 comes from the internal NGROUPS_SMALL macro which defines the length for the internal kernel "small" groups buffer. There is no apparent reason to limit to this value. This patch removes the 32 groups printing limit. The Linux kernel limits the amount of supplementary groups by NGROUPS_MAX, which is currently set to 65536. And this is the maximum count of groups we may possibly print. Signed-off-by: Artem Bityutskiy Acked-by: Serge E. Hallyn Acked-by: Kees Cook Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index 077235ffb38b..439544fec388 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -212,7 +212,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, group_info = cred->group_info; task_unlock(p); - for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++) + for (g = 0; g < group_info->ngroups; g++) seq_printf(m, "%d ", from_kgid_munged(user_ns, GROUP_AT(group_info, g))); put_cred(cred); -- cgit v1.2.1 From d740269867021faf4ce38a449353d2b986c34a67 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 17 Dec 2012 16:03:20 -0800 Subject: exec: use -ELOOP for max recursion depth To avoid an explosion of request_module calls on a chain of abusive scripts, fail maximum recursion with -ELOOP instead of -ENOEXEC. As soon as maximum recursion depth is hit, the error will fail all the way back up the chain, aborting immediately. This also has the side-effect of stopping the user's shell from attempting to reexecute the top-level file as a shell script. As seen in the dash source: if (cmd != path_bshell && errno == ENOEXEC) { *argv-- = cmd; *argv = cmd = path_bshell; goto repeat; } The above logic was designed for running scripts automatically that lacked the "#!" header, not to re-try failed recursion. 
On a legitimate -ENOEXEC, things continue to behave as the shell expects. Additionally, when tracking recursion, the binfmt handlers should not be involved. The recursion being tracked is the depth of calls through search_binary_handler(), so that function should be exclusively responsible for tracking the depth. Signed-off-by: Kees Cook Cc: halfdog Cc: P J P Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_em86.c | 1 - fs/binfmt_misc.c | 6 ------ fs/binfmt_script.c | 4 +--- fs/exec.c | 10 +++++----- 4 files changed, 6 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c index 4e6cce57d113..037a3e2b045b 100644 --- a/fs/binfmt_em86.c +++ b/fs/binfmt_em86.c @@ -42,7 +42,6 @@ static int load_em86(struct linux_binprm *bprm) return -ENOEXEC; } - bprm->recursion_depth++; /* Well, the bang-shell is implicit... */ allow_write_access(bprm->file); fput(bprm->file); bprm->file = NULL; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index b0b70fbea06c..9be335fb8a7c 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -117,10 +117,6 @@ static int load_misc_binary(struct linux_binprm *bprm) if (!enabled) goto _ret; - retval = -ENOEXEC; - if (bprm->recursion_depth > BINPRM_MAX_RECURSION) - goto _ret; - /* to keep locking time low, we copy the interpreter string */ read_lock(&entries_lock); fmt = check_file(bprm); @@ -197,8 +193,6 @@ static int load_misc_binary(struct linux_binprm *bprm) if (retval < 0) goto _error; - bprm->recursion_depth++; - retval = search_binary_handler(bprm); if (retval < 0) goto _error; diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index 8c954997e7f7..1610a91637e5 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -22,15 +22,13 @@ static int load_script(struct linux_binprm *bprm) char interp[BINPRM_BUF_SIZE]; int retval; - if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!') || - (bprm->recursion_depth > BINPRM_MAX_RECURSION)) + if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!')) return -ENOEXEC; /* * This section does the #! interpretation. * Sorta complicated, but hopefully it will work. -TYT */ - bprm->recursion_depth++; allow_write_access(bprm->file); fput(bprm->file); bprm->file = NULL; diff --git a/fs/exec.c b/fs/exec.c index 721a29929511..d5eb9e605ffd 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1356,6 +1356,10 @@ int search_binary_handler(struct linux_binprm *bprm) struct linux_binfmt *fmt; pid_t old_pid, old_vpid; + /* This allows 4 levels of binfmt rewrites before failing hard. */ + if (depth > 5) + return -ELOOP; + retval = security_bprm_check(bprm); if (retval) return retval; @@ -1380,12 +1384,8 @@ int search_binary_handler(struct linux_binprm *bprm) if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); + bprm->recursion_depth = depth + 1; retval = fn(bprm); - /* - * Restore the depth counter to its starting value - * in this call, so we don't have to rely on every - * load_binary function to restore it on return. - */ bprm->recursion_depth = depth; if (retval >= 0) { if (depth == 0) { -- cgit v1.2.1 From cdd9fa8de64bc5b33d8e943dde486b60d8468ec0 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Mon, 17 Dec 2012 16:04:35 -0800 Subject: ubifs: use prandom_bytes This also converts filling memory loop to use memset. 
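The conversion pattern is generic; a minimal kernel-style sketch (illustration only, not the ubifs code itself, and fill_range() is an invented name):

    #include <linux/types.h>
    #include <linux/string.h>
    #include <linux/random.h>

    /* Fill buf[from..to) with either 0xFF bytes or pseudo-random data.
     * Open-coded "for (i = from; i < to; i++) p[i] = ..." loops collapse
     * into single calls: */
    static void fill_range(unsigned char *p, unsigned int from,
                           unsigned int to, bool use_ff)
    {
        if (use_ff)
            memset(p + from, 0xFF, to - from);
        else
            prandom_bytes(p + from, to - from);
    }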
Signed-off-by: Akinobu Mita Cc: Artem Bityutskiy Cc: Adrian Hunter Cc: "Theodore Ts'o" Cc: David Laight Cc: David Woodhouse Cc: Eilon Greenstein Cc: Michel Lespinasse Cc: Robert Love Cc: Valdis Kletnieks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ubifs/debug.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 62911637e12f..12817ffc7345 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2560,7 +2560,7 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write) static int corrupt_data(const struct ubifs_info *c, const void *buf, unsigned int len) { - unsigned int from, to, i, ffs = chance(1, 2); + unsigned int from, to, ffs = chance(1, 2); unsigned char *p = (void *)buf; from = random32() % (len + 1); @@ -2571,11 +2571,9 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf, ffs ? "0xFFs" : "random data"); if (ffs) - for (i = from; i < to; i++) - p[i] = 0xFF; + memset(p + from, 0xFF, to - from); else - for (i = from; i < to; i++) - p[i] = random32() % 0x100; + prandom_bytes(p + from, to - from); return to; } -- cgit v1.2.1 From 55985dd72ab27b47530dcc8bdddd28b69f4abe8b Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 17 Dec 2012 16:04:55 -0800 Subject: procfs: add ability to plug in auxiliary fdinfo providers This patch brings ability to print out auxiliary data associated with file in procfs interface /proc/pid/fdinfo/fd. In particular further patches make eventfd, evenpoll, signalfd and fsnotify to print additional information complete enough to restore these objects after checkpoint. To simplify the code we add show_fdinfo callback inside struct file_operations (as Al and Pavel are proposing). Signed-off-by: Cyrill Gorcunov Acked-by: Pavel Emelyanov Cc: Oleg Nesterov Cc: Andrey Vagin Cc: Al Viro Cc: Alexey Dobriyan Cc: James Bottomley Cc: "Aneesh Kumar K.V" Cc: Alexey Dobriyan Cc: Matthew Helsley Cc: "J. Bruce Fields" Cc: "Aneesh Kumar K.V" Cc: Tvrtko Ursulin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/fd.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/proc/fd.c b/fs/proc/fd.c index f28a875f8779..d7a4a28ef630 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -50,6 +50,8 @@ static int seq_show(struct seq_file *m, void *v) if (!ret) { seq_printf(m, "pos:\t%lli\nflags:\t0%o\n", (long long)file->f_pos, f_flags); + if (file->f_op->show_fdinfo) + ret = file->f_op->show_fdinfo(m, file); fput(file); } -- cgit v1.2.1 From cbac5542d48127b546a23d816380a7926eee1c25 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 17 Dec 2012 16:04:57 -0800 Subject: fs, eventfd: add procfs fdinfo helper This allows us to print out raw counter value. The /proc/pid/fdinfo/fd output is | pos: 0 | flags: 04002 | eventfd-count: 5a Signed-off-by: Cyrill Gorcunov Acked-by: Pavel Emelyanov Cc: Oleg Nesterov Cc: Andrey Vagin Cc: Al Viro Cc: Alexey Dobriyan Cc: James Bottomley Cc: "Aneesh Kumar K.V" Cc: Alexey Dobriyan Cc: Matthew Helsley Cc: "J. 
Bruce Fields" Cc: "Aneesh Kumar K.V" Cc: Tvrtko Ursulin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventfd.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'fs') diff --git a/fs/eventfd.c b/fs/eventfd.c index d81b9f654086..35470d9b96e6 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include struct eventfd_ctx { struct kref kref; @@ -284,7 +286,25 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c return res; } +#ifdef CONFIG_PROC_FS +static int eventfd_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct eventfd_ctx *ctx = f->private_data; + int ret; + + spin_lock_irq(&ctx->wqh.lock); + ret = seq_printf(m, "eventfd-count: %16llx\n", + (unsigned long long)ctx->count); + spin_unlock_irq(&ctx->wqh.lock); + + return ret; +} +#endif + static const struct file_operations eventfd_fops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo = eventfd_show_fdinfo, +#endif .release = eventfd_release, .poll = eventfd_poll, .read = eventfd_read, -- cgit v1.2.1 From 138d22b58696c506799f8de759804083ff9effae Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 17 Dec 2012 16:05:02 -0800 Subject: fs, epoll: add procfs fdinfo helper This allows us to print out eventpoll target file descriptor, events and data, the /proc/pid/fdinfo/fd consists of | pos: 0 | flags: 02 | tfd: 5 events: 1d data: ffffffffffffffff enabled: 1 [avagin@: fix for unitialized ret variable] Signed-off-by: Cyrill Gorcunov Acked-by: Pavel Emelyanov Cc: Oleg Nesterov Cc: Andrey Vagin Cc: Al Viro Cc: Alexey Dobriyan Cc: James Bottomley Cc: "Aneesh Kumar K.V" Cc: Alexey Dobriyan Cc: Matthew Helsley Cc: "J. Bruce Fields" Cc: "Aneesh Kumar K.V" Cc: Tvrtko Ursulin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 28 ++++++++++++++++++++++++++++ fs/proc/array.c | 2 +- fs/signalfd.c | 18 ++++++++++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index cd96649bfe62..be56b21435f8 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -38,6 +38,8 @@ #include #include #include +#include +#include /* * LOCKING: @@ -783,8 +785,34 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) return pollflags != -1 ? 
pollflags : 0; } +#ifdef CONFIG_PROC_FS +static int ep_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct eventpoll *ep = f->private_data; + struct rb_node *rbp; + int ret = 0; + + mutex_lock(&ep->mtx); + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + struct epitem *epi = rb_entry(rbp, struct epitem, rbn); + + ret = seq_printf(m, "tfd: %8d events: %8x data: %16llx\n", + epi->ffd.fd, epi->event.events, + (long long)epi->event.data); + if (ret) + break; + } + mutex_unlock(&ep->mtx); + + return ret; +} +#endif + /* File callbacks that implement the eventpoll file behaviour */ static const struct file_operations eventpoll_fops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo = ep_show_fdinfo, +#endif .release = ep_eventpoll_release, .poll = ep_eventpoll_poll, .llseek = noop_llseek, diff --git a/fs/proc/array.c b/fs/proc/array.c index 439544fec388..060a56a91278 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -220,7 +220,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, seq_putc(m, '\n'); } -static void render_sigset_t(struct seq_file *m, const char *header, +void render_sigset_t(struct seq_file *m, const char *header, sigset_t *set) { int i; diff --git a/fs/signalfd.c b/fs/signalfd.c index 8bee4e570911..b53486961735 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -29,6 +29,7 @@ #include #include #include +#include void signalfd_cleanup(struct sighand_struct *sighand) { @@ -227,7 +228,24 @@ static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count, return total ? total: ret; } +#ifdef CONFIG_PROC_FS +static int signalfd_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct signalfd_ctx *ctx = f->private_data; + sigset_t sigmask; + + sigmask = ctx->sigmask; + signotset(&sigmask); + render_sigset_t(m, "sigmask:\t", &sigmask); + + return 0; +} +#endif + static const struct file_operations signalfd_fops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo = signalfd_show_fdinfo, +#endif .release = signalfd_release, .poll = signalfd_poll, .read = signalfd_read, -- cgit v1.2.1 From ab49bdecc3ebb46ab661f5f05d5c5ea9606406c6 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 17 Dec 2012 16:05:06 -0800 Subject: fs, exportfs: escape nil dereference if no s_export_op present This routine will be used to generate a file handle in fdinfo output for inotify subsystem, where if no s_export_op present the general export_encode_fh should be used. Thus add a test if s_export_op present inside exportfs_encode_fh itself. Signed-off-by: Cyrill Gorcunov Acked-by: Pavel Emelyanov Cc: Oleg Nesterov Cc: Andrey Vagin Cc: Al Viro Cc: Alexey Dobriyan Cc: James Bottomley Cc: "Aneesh Kumar K.V" Cc: Alexey Dobriyan Cc: Matthew Helsley Cc: "J. 
Bruce Fields" Cc: "Aneesh Kumar K.V" Cc: Tvrtko Ursulin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exportfs/expfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 29ab099e3e08..10f137381ac7 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -357,7 +357,7 @@ int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len, */ parent = p->d_inode; } - if (nop->encode_fh) + if (nop && nop->encode_fh) error = nop->encode_fh(inode, fid->raw, max_len, parent); else error = export_encode_fh(inode, fid, max_len, parent); -- cgit v1.2.1 From 711c7bf9914060d7aaf3c1a15f38094a5d5e748f Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 17 Dec 2012 16:05:08 -0800 Subject: fs, exportfs: add exportfs_encode_inode_fh() helper We will need this helper in the next patch to provide a file handle for inotify marks in /proc/pid/fdinfo output. The patch is rather providing the way to use inodes directly when dentry is not available (like in case of inotify system). Signed-off-by: Cyrill Gorcunov Acked-by: Pavel Emelyanov Cc: Oleg Nesterov Cc: Andrey Vagin Cc: Al Viro Cc: Alexey Dobriyan Cc: James Bottomley Cc: "Aneesh Kumar K.V" Cc: Alexey Dobriyan Cc: Matthew Helsley Cc: "J. Bruce Fields" Cc: "Aneesh Kumar K.V" Cc: Tvrtko Ursulin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exportfs/expfs.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 10f137381ac7..606bb074c501 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -341,10 +341,21 @@ static int export_encode_fh(struct inode *inode, struct fid *fid, return type; } +int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid, + int *max_len, struct inode *parent) +{ + const struct export_operations *nop = inode->i_sb->s_export_op; + + if (nop && nop->encode_fh) + return nop->encode_fh(inode, fid->raw, max_len, parent); + + return export_encode_fh(inode, fid, max_len, parent); +} +EXPORT_SYMBOL_GPL(exportfs_encode_inode_fh); + int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len, int connectable) { - const struct export_operations *nop = dentry->d_sb->s_export_op; int error; struct dentry *p = NULL; struct inode *inode = dentry->d_inode, *parent = NULL; @@ -357,10 +368,8 @@ int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len, */ parent = p->d_inode; } - if (nop && nop->encode_fh) - error = nop->encode_fh(inode, fid->raw, max_len, parent); - else - error = export_encode_fh(inode, fid, max_len, parent); + + error = exportfs_encode_inode_fh(inode, fid, max_len, parent); dput(p); return error; -- cgit v1.2.1 From be77196b809cdce8603a5aadd5e3cfabd3cbef96 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 17 Dec 2012 16:05:12 -0800 Subject: fs, notify: add procfs fdinfo helper This allow us to print out fsnotify details such as watchee inode, device, mask and optionally a file handle. 
For inotify objects if kernel compiled with exportfs support the output will be | pos: 0 | flags: 02000000 | inotify wd:3 ino:9e7e sdev:800013 mask:800afce ignored_mask:0 fhandle-bytes:8 fhandle-type:1 f_handle:7e9e0000640d1b6d | inotify wd:2 ino:a111 sdev:800013 mask:800afce ignored_mask:0 fhandle-bytes:8 fhandle-type:1 f_handle:11a1000020542153 | inotify wd:1 ino:6b149 sdev:800013 mask:800afce ignored_mask:0 fhandle-bytes:8 fhandle-type:1 f_handle:49b1060023552153 If kernel compiled without exportfs support, the file handle won't be provided but inode and device only. | pos: 0 | flags: 02000000 | inotify wd:3 ino:9e7e sdev:800013 mask:800afce ignored_mask:0 | inotify wd:2 ino:a111 sdev:800013 mask:800afce ignored_mask:0 | inotify wd:1 ino:6b149 sdev:800013 mask:800afce ignored_mask:0 For fanotify the output is like | pos: 0 | flags: 04002 | fanotify flags:10 event-flags:0 | fanotify mnt_id:12 mask:3b ignored_mask:0 | fanotify ino:50205 sdev:800013 mask:3b ignored_mask:40000000 fhandle-bytes:8 fhandle-type:1 f_handle:05020500fb1d47e7 To minimize impact on general fsnotify code the new functionality is gathered in fs/notify/fdinfo.c file. Signed-off-by: Cyrill Gorcunov Acked-by: Pavel Emelyanov Cc: Oleg Nesterov Cc: Andrey Vagin Cc: Al Viro Cc: Alexey Dobriyan Cc: James Bottomley Cc: "Aneesh Kumar K.V" Cc: Alexey Dobriyan Cc: Matthew Helsley Cc: "J. Bruce Fields" Cc: "Aneesh Kumar K.V" Cc: Tvrtko Ursulin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/Makefile | 2 +- fs/notify/fanotify/fanotify_user.c | 2 + fs/notify/fdinfo.c | 175 +++++++++++++++++++++++++++++++++++++ fs/notify/fdinfo.h | 27 ++++++ fs/notify/inotify/inotify_user.c | 2 + 5 files changed, 207 insertions(+), 1 deletion(-) create mode 100644 fs/notify/fdinfo.c create mode 100644 fs/notify/fdinfo.h (limited to 'fs') diff --git a/fs/notify/Makefile b/fs/notify/Makefile index ae5f33a6d868..96d3420d0242 100644 --- a/fs/notify/Makefile +++ b/fs/notify/Makefile @@ -1,5 +1,5 @@ obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o \ - mark.o vfsmount_mark.o + mark.o vfsmount_mark.o fdinfo.o obj-y += dnotify/ obj-y += inotify/ diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 6fcaeb8c902e..a5cd9bba022f 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -17,6 +17,7 @@ #include #include "../../mount.h" +#include "../fdinfo.h" #define FANOTIFY_DEFAULT_MAX_EVENTS 16384 #define FANOTIFY_DEFAULT_MAX_MARKS 8192 @@ -428,6 +429,7 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar } static const struct file_operations fanotify_fops = { + .show_fdinfo = fanotify_show_fdinfo, .poll = fanotify_poll, .read = fanotify_read, .write = fanotify_write, diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c new file mode 100644 index 000000000000..cb996179abfd --- /dev/null +++ b/fs/notify/fdinfo.c @@ -0,0 +1,175 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "inotify/inotify.h" +#include "../fs/mount.h" + +#if defined(CONFIG_PROC_FS) + +#if defined(CONFIG_INOTIFY_USER) || defined(CONFIG_FANOTIFY) + +static int show_fdinfo(struct seq_file *m, struct file *f, + int (*show)(struct seq_file *m, struct fsnotify_mark *mark)) +{ + struct fsnotify_group *group = f->private_data; + struct fsnotify_mark *mark; + int ret = 0; + + spin_lock(&group->mark_lock); + list_for_each_entry(mark, 
&group->marks_list, g_list) { + ret = show(m, mark); + if (ret) + break; + } + spin_unlock(&group->mark_lock); + return ret; +} + +#if defined(CONFIG_EXPORTFS) +static int show_mark_fhandle(struct seq_file *m, struct inode *inode) +{ + struct { + struct file_handle handle; + u8 pad[64]; + } f; + int size, ret, i; + + f.handle.handle_bytes = sizeof(f.pad); + size = f.handle.handle_bytes >> 2; + + ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0); + if ((ret == 255) || (ret == -ENOSPC)) { + WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret); + return 0; + } + + f.handle.handle_type = ret; + f.handle.handle_bytes = size * sizeof(u32); + + ret = seq_printf(m, "fhandle-bytes:%x fhandle-type:%x f_handle:", + f.handle.handle_bytes, f.handle.handle_type); + + for (i = 0; i < f.handle.handle_bytes; i++) + ret |= seq_printf(m, "%02x", (int)f.handle.f_handle[i]); + + return ret; +} +#else +static int show_mark_fhandle(struct seq_file *m, struct inode *inode) +{ + return 0; +} +#endif + +#ifdef CONFIG_INOTIFY_USER + +static int inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) +{ + struct inotify_inode_mark *inode_mark; + struct inode *inode; + int ret = 0; + + if (!(mark->flags & (FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_INODE))) + return 0; + + inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); + inode = igrab(mark->i.inode); + if (inode) { + ret = seq_printf(m, "inotify wd:%x ino:%lx sdev:%x " + "mask:%x ignored_mask:%x ", + inode_mark->wd, inode->i_ino, + inode->i_sb->s_dev, + mark->mask, mark->ignored_mask); + ret |= show_mark_fhandle(m, inode); + ret |= seq_putc(m, '\n'); + iput(inode); + } + + return ret; +} + +int inotify_show_fdinfo(struct seq_file *m, struct file *f) +{ + return show_fdinfo(m, f, inotify_fdinfo); +} + +#endif /* CONFIG_INOTIFY_USER */ + +#ifdef CONFIG_FANOTIFY + +static int fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) +{ + struct inode *inode; + int ret = 0; + + if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) + return 0; + + if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { + inode = igrab(mark->i.inode); + if (!inode) + goto out; + ret = seq_printf(m, "fanotify ino:%lx sdev:%x " + "mask:%x ignored_mask:%x ", + inode->i_ino, inode->i_sb->s_dev, + mark->mask, mark->ignored_mask); + ret |= show_mark_fhandle(m, inode); + ret |= seq_putc(m, '\n'); + iput(inode); + } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) { + struct mount *mnt = real_mount(mark->m.mnt); + + ret = seq_printf(m, "fanotify mnt_id:%x mask:%x " + "ignored_mask:%x\n", + mnt->mnt_id, mark->mask, mark->ignored_mask); + } +out: + return ret; +} + +int fanotify_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct fsnotify_group *group = f->private_data; + unsigned int flags = 0; + + switch (group->priority) { + case FS_PRIO_0: + flags |= FAN_CLASS_NOTIF; + break; + case FS_PRIO_1: + flags |= FAN_CLASS_CONTENT; + break; + case FS_PRIO_2: + flags |= FAN_CLASS_PRE_CONTENT; + break; + } + + if (group->max_events == UINT_MAX) + flags |= FAN_UNLIMITED_QUEUE; + + if (group->fanotify_data.max_marks == UINT_MAX) + flags |= FAN_UNLIMITED_MARKS; + + seq_printf(m, "fanotify flags:%x event-flags:%x\n", + flags, group->fanotify_data.f_flags); + + return show_fdinfo(m, f, fanotify_fdinfo); +} + +#endif /* CONFIG_FANOTIFY */ + +#endif /* CONFIG_INOTIFY_USER || CONFIG_FANOTIFY */ + +#endif /* CONFIG_PROC_FS */ diff --git a/fs/notify/fdinfo.h b/fs/notify/fdinfo.h new file mode 100644 index 000000000000..556afda990e9 --- 
/dev/null +++ b/fs/notify/fdinfo.h @@ -0,0 +1,27 @@ +#ifndef __FSNOTIFY_FDINFO_H__ +#define __FSNOTIFY_FDINFO_H__ + +#include +#include + +struct seq_file; +struct file; + +#ifdef CONFIG_PROC_FS + +#ifdef CONFIG_INOTIFY_USER +extern int inotify_show_fdinfo(struct seq_file *m, struct file *f); +#endif + +#ifdef CONFIG_FANOTIFY +extern int fanotify_show_fdinfo(struct seq_file *m, struct file *f); +#endif + +#else /* CONFIG_PROC_FS */ + +#define inotify_show_fdinfo NULL +#define fanotify_show_fdinfo NULL + +#endif /* CONFIG_PROC_FS */ + +#endif /* __FSNOTIFY_FDINFO_H__ */ diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index c311dda054a3..36cb013c7c13 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -40,6 +40,7 @@ #include #include "inotify.h" +#include "../fdinfo.h" #include @@ -335,6 +336,7 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, } static const struct file_operations inotify_fops = { + .show_fdinfo = inotify_show_fdinfo, .poll = inotify_poll, .read = inotify_read, .fasync = inotify_fasync, -- cgit v1.2.1 From e6dbcafb744ab94a94142a6e721e16330397fad8 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 17 Dec 2012 16:05:16 -0800 Subject: fs, fanotify: add @mflags field to fanotify output The kernel keeps FAN_MARK_IGNORED_SURV_MODIFY bit separately from fsnotify_mark::mask|ignored_mask thus put it in @mflags (mark flags) field so the user-space reader will be able to detect if such bit were used on mark creation procedure. | pos: 0 | flags: 04002 | fanotify flags:10 event-flags:0 | fanotify mnt_id:12 mflags:40 mask:38 ignored_mask:40000003 | fanotify ino:4f969 sdev:800013 mflags:0 mask:3b ignored_mask:40000000 fhandle-bytes:8 fhandle-type:1 f_handle:69f90400c275b5b4 Signed-off-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: Oleg Nesterov Cc: Andrey Vagin Cc: Al Viro Cc: Alexey Dobriyan Cc: James Bottomley Cc: "Aneesh Kumar K.V" Cc: Matthew Helsley Cc: "J. 
Bruce Fields" Cc: Tvrtko Ursulin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fdinfo.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index cb996179abfd..514c4b81483d 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -111,29 +111,33 @@ int inotify_show_fdinfo(struct seq_file *m, struct file *f) static int fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) { + unsigned int mflags = 0; struct inode *inode; int ret = 0; if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) return 0; + if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) + mflags |= FAN_MARK_IGNORED_SURV_MODIFY; + if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { inode = igrab(mark->i.inode); if (!inode) goto out; ret = seq_printf(m, "fanotify ino:%lx sdev:%x " - "mask:%x ignored_mask:%x ", + "mflags:%x mask:%x ignored_mask:%x ", inode->i_ino, inode->i_sb->s_dev, - mark->mask, mark->ignored_mask); + mflags, mark->mask, mark->ignored_mask); ret |= show_mark_fhandle(m, inode); ret |= seq_putc(m, '\n'); iput(inode); } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) { struct mount *mnt = real_mount(mark->m.mnt); - ret = seq_printf(m, "fanotify mnt_id:%x mask:%x " - "ignored_mask:%x\n", - mnt->mnt_id, mark->mask, mark->ignored_mask); + ret = seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x " + "ignored_mask:%x\n", mnt->mnt_id, mflags, + mark->mask, mark->ignored_mask); } out: return ret; -- cgit v1.2.1 From d5f50b0c290431c65377c4afa1c764e2c3fe5305 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 4 Dec 2012 18:25:10 -0500 Subject: nfsd4: fix oops on unusual readlike compound If the argument and reply together exceed the maximum payload size, then a reply with a read-like operation can overlow the rq_pages array. Cc: stable@kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 3bf8a9d7f217..d7a3be5ab777 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2925,11 +2925,16 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, len = maxcount; v = 0; while (len > 0) { - pn = resp->rqstp->rq_resused++; + pn = resp->rqstp->rq_resused; + if (!resp->rqstp->rq_respages[pn]) { /* ran out of pages */ + maxcount -= len; + break; + } resp->rqstp->rq_vec[v].iov_base = page_address(resp->rqstp->rq_respages[pn]); resp->rqstp->rq_vec[v].iov_len = len < PAGE_SIZE ? len : PAGE_SIZE; + resp->rqstp->rq_resused++; v++; len -= PAGE_SIZE; } @@ -2975,6 +2980,8 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd return nfserr; if (resp->xbuf->page_len) return nfserr_resource; + if (!resp->rqstp->rq_respages[resp->rqstp->rq_resused]) + return nfserr_resource; page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused++]); @@ -3024,6 +3031,8 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 return nfserr; if (resp->xbuf->page_len) return nfserr_resource; + if (!resp->rqstp->rq_respages[resp->rqstp->rq_resused]) + return nfserr_resource; RESERVE_SPACE(NFS4_VERIFIER_SIZE); savep = p; -- cgit v1.2.1 From 79f77bf9a4e3dd5ead006b8f17e7c4ff07d8374e Mon Sep 17 00:00:00 2001 From: "J. 
Bruce Fields" Date: Mon, 10 Dec 2012 18:04:53 -0500 Subject: nfsd: warn on odd reply state in nfsd_vfs_read As far as I can tell this shouldn't currently happen--or if it does, something is wrong and data is going to be corrupted. Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 0ef9b6b410a2..b31e46eeb026 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -936,6 +936,7 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, .u.data = rqstp, }; + WARN_ON_ONCE(rqstp->rq_resused != 1); rqstp->rq_resused = 1; host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); } else { -- cgit v1.2.1 From afc59400d6c65bad66d4ad0b2daf879cbff8e23e Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 10 Dec 2012 18:01:37 -0500 Subject: nfsd4: cleanup: replace rq_resused count by rq_next_page pointer It may be a matter of personal taste, but I find this makes the code clearer. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs2acl.c | 2 +- fs/nfsd/nfs3acl.c | 2 +- fs/nfsd/nfs3proc.c | 6 +++--- fs/nfsd/nfs3xdr.c | 33 ++++++++++++++++----------------- fs/nfsd/nfs4xdr.c | 24 ++++++++++++------------ fs/nfsd/nfsxdr.c | 11 ++++++----- fs/nfsd/vfs.c | 18 ++++++++---------- 7 files changed, 47 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index b314888825d5..9170861c804a 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -253,7 +253,7 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p, (resp->mask & NFS_ACL) ? resp->acl_access : NULL, (resp->mask & NFS_DFACL) ? resp->acl_default : NULL); while (w > 0) { - if (!rqstp->rq_respages[rqstp->rq_resused++]) + if (!*(rqstp->rq_next_page++)) return 0; w -= PAGE_SIZE; } diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index a596e9d987e4..9cbc1a841f87 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -184,7 +184,7 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p, (resp->mask & NFS_ACL) ? resp->acl_access : NULL, (resp->mask & NFS_DFACL) ? 
resp->acl_default : NULL); while (w > 0) { - if (!rqstp->rq_respages[rqstp->rq_resused++]) + if (!*(rqstp->rq_next_page++)) return 0; w -= PAGE_SIZE; } diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 97d90d1c8608..1fc02dfdc5c4 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -460,7 +460,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp, __be32 nfserr; int count = 0; loff_t offset; - int i; + struct page **p; caddr_t page_addr = NULL; dprintk("nfsd: READDIR+(3) %s %d bytes at %d\n", @@ -484,8 +484,8 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp, &resp->common, nfs3svc_encode_entry_plus); memcpy(resp->verf, argp->verf, 8); - for (i=1; irq_resused ; i++) { - page_addr = page_address(rqstp->rq_respages[i]); + for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) { + page_addr = page_address(*p); if (((caddr_t)resp->buffer >= page_addr) && ((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) { diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 2b8618de6c27..324c0baf7cda 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -325,7 +325,7 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_readargs *args) { unsigned int len; - int v,pn; + int v; u32 max_blocksize = svc_max_payload(rqstp); if (!(p = decode_fh(p, &args->fh))) @@ -340,8 +340,9 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, /* set up the kvec */ v=0; while (len > 0) { - pn = rqstp->rq_resused++; - rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_respages[pn]); + struct page *p = *(rqstp->rq_next_page++); + + rqstp->rq_vec[v].iov_base = page_address(p); rqstp->rq_vec[v].iov_len = len < PAGE_SIZE? len : PAGE_SIZE; len -= rqstp->rq_vec[v].iov_len; v++; @@ -463,8 +464,7 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p, len = ntohl(*p++); if (len == 0 || len > NFS3_MAXPATHLEN || len >= PAGE_SIZE) return 0; - args->tname = new = - page_address(rqstp->rq_respages[rqstp->rq_resused++]); + args->tname = new = page_address(*(rqstp->rq_next_page++)); args->tlen = len; /* first copy and check from the first page */ old = (char*)p; @@ -535,8 +535,7 @@ nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, { if (!(p = decode_fh(p, &args->fh))) return 0; - args->buffer = - page_address(rqstp->rq_respages[rqstp->rq_resused++]); + args->buffer = page_address(*(rqstp->rq_next_page++)); return xdr_argsize_check(rqstp, p); } @@ -567,8 +566,7 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p, if (args->count > PAGE_SIZE) args->count = PAGE_SIZE; - args->buffer = - page_address(rqstp->rq_respages[rqstp->rq_resused++]); + args->buffer = page_address(*(rqstp->rq_next_page++)); return xdr_argsize_check(rqstp, p); } @@ -577,7 +575,7 @@ int nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_readdirargs *args) { - int len, pn; + int len; u32 max_blocksize = svc_max_payload(rqstp); if (!(p = decode_fh(p, &args->fh))) @@ -592,9 +590,9 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p, args->count = len; while (len > 0) { - pn = rqstp->rq_resused++; + struct page *p = *(rqstp->rq_next_page++); if (!args->buffer) - args->buffer = page_address(rqstp->rq_respages[pn]); + args->buffer = page_address(p); len -= PAGE_SIZE; } @@ -880,7 +878,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen, common); __be32 *p = cd->buffer; caddr_t curr_page_addr = NULL; - int pn; /* current page number */ + struct page ** page; int slen; /* 
string (name) length */ int elen; /* estimated entry length in words */ int num_entry_words = 0; /* actual number of words */ @@ -917,8 +915,9 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen, } /* determine which page in rq_respages[] we are currently filling */ - for (pn=1; pn < cd->rqstp->rq_resused; pn++) { - curr_page_addr = page_address(cd->rqstp->rq_respages[pn]); + for (page = cd->rqstp->rq_respages + 1; + page < cd->rqstp->rq_next_page; page++) { + curr_page_addr = page_address(*page); if (((caddr_t)cd->buffer >= curr_page_addr) && ((caddr_t)cd->buffer < curr_page_addr + PAGE_SIZE)) @@ -933,14 +932,14 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen, if (plus) p = encode_entryplus_baggage(cd, p, name, namlen); num_entry_words = p - cd->buffer; - } else if (cd->rqstp->rq_respages[pn+1] != NULL) { + } else if (*(page+1) != NULL) { /* temporarily encode entry into next page, then move back to * current and next page in rq_respages[] */ __be32 *p1, *tmp; int len1, len2; /* grab next page for temporary storage of entry */ - p1 = tmp = page_address(cd->rqstp->rq_respages[pn+1]); + p1 = tmp = page_address(*(page+1)); p1 = encode_entry_baggage(cd, p1, name, namlen, ino); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index d7a3be5ab777..0dc11586682f 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2906,7 +2906,8 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_read *read) { u32 eof; - int v, pn; + int v; + struct page *page; unsigned long maxcount; long len; __be32 *p; @@ -2925,16 +2926,15 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, len = maxcount; v = 0; while (len > 0) { - pn = resp->rqstp->rq_resused; - if (!resp->rqstp->rq_respages[pn]) { /* ran out of pages */ + page = *(resp->rqstp->rq_next_page); + if (!page) { /* ran out of pages */ maxcount -= len; break; } - resp->rqstp->rq_vec[v].iov_base = - page_address(resp->rqstp->rq_respages[pn]); + resp->rqstp->rq_vec[v].iov_base = page_address(page); resp->rqstp->rq_vec[v].iov_len = len < PAGE_SIZE ? 
len : PAGE_SIZE; - resp->rqstp->rq_resused++; + resp->rqstp->rq_next_page++; v++; len -= PAGE_SIZE; } @@ -2980,10 +2980,10 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd return nfserr; if (resp->xbuf->page_len) return nfserr_resource; - if (!resp->rqstp->rq_respages[resp->rqstp->rq_resused]) + if (!*resp->rqstp->rq_next_page) return nfserr_resource; - page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused++]); + page = page_address(*(resp->rqstp->rq_next_page++)); maxcount = PAGE_SIZE; RESERVE_SPACE(4); @@ -3031,7 +3031,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 return nfserr; if (resp->xbuf->page_len) return nfserr_resource; - if (!resp->rqstp->rq_respages[resp->rqstp->rq_resused]) + if (!*resp->rqstp->rq_next_page) return nfserr_resource; RESERVE_SPACE(NFS4_VERIFIER_SIZE); @@ -3059,7 +3059,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 goto err_no_verf; } - page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused++]); + page = page_address(*(resp->rqstp->rq_next_page++)); readdir->common.err = 0; readdir->buflen = maxcount; readdir->buffer = page; @@ -3082,8 +3082,8 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 p = readdir->buffer; *p++ = 0; /* no more entries */ *p++ = htonl(readdir->common.err == nfserr_eof); - resp->xbuf->page_len = ((char*)p) - (char*)page_address( - resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); + resp->xbuf->page_len = ((char*)p) - + (char*)page_address(*(resp->rqstp->rq_next_page-1)); /* Use rest of head for padding and remaining ops: */ resp->xbuf->tail[0].iov_base = tailbase; diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 65ec595e2226..979b42106979 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -246,7 +246,7 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readargs *args) { unsigned int len; - int v,pn; + int v; if (!(p = decode_fh(p, &args->fh))) return 0; @@ -262,8 +262,9 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, */ v=0; while (len > 0) { - pn = rqstp->rq_resused++; - rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_respages[pn]); + struct page *p = *(rqstp->rq_next_page++); + + rqstp->rq_vec[v].iov_base = page_address(p); rqstp->rq_vec[v].iov_len = len < PAGE_SIZE?len:PAGE_SIZE; len -= rqstp->rq_vec[v].iov_len; v++; @@ -355,7 +356,7 @@ nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readli { if (!(p = decode_fh(p, &args->fh))) return 0; - args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused++]); + args->buffer = page_address(*(rqstp->rq_next_page++)); return xdr_argsize_check(rqstp, p); } @@ -396,7 +397,7 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p, if (args->count > PAGE_SIZE) args->count = PAGE_SIZE; - args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused++]); + args->buffer = page_address(*(rqstp->rq_next_page++)); return xdr_argsize_check(rqstp, p); } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index b31e46eeb026..f0a6d88d7fff 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -886,7 +886,7 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { struct svc_rqst *rqstp = sd->u.data; - struct page **pp = rqstp->rq_respages + rqstp->rq_resused; + struct page **pp = rqstp->rq_next_page; struct page *page = buf->page; size_t size; @@ -894,17 +894,15 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, 
struct pipe_buffer *buf, if (rqstp->rq_res.page_len == 0) { get_page(page); - put_page(*pp); - *pp = page; - rqstp->rq_resused++; + put_page(*rqstp->rq_next_page); + *(rqstp->rq_next_page++) = page; rqstp->rq_res.page_base = buf->offset; rqstp->rq_res.page_len = size; } else if (page != pp[-1]) { get_page(page); - if (*pp) - put_page(*pp); - *pp = page; - rqstp->rq_resused++; + if (*rqstp->rq_next_page) + put_page(*rqstp->rq_next_page); + *(rqstp->rq_next_page++) = page; rqstp->rq_res.page_len += size; } else rqstp->rq_res.page_len += size; @@ -936,8 +934,8 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, .u.data = rqstp, }; - WARN_ON_ONCE(rqstp->rq_resused != 1); - rqstp->rq_resused = 1; + WARN_ON_ONCE(rqstp->rq_next_page != rqstp->rq_respages + 1); + rqstp->rq_next_page = rqstp->rq_respages + 1; host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); } else { oldfs = get_fs(); -- cgit v1.2.1 From a1dc6955829f20ad80c1d6a411ecbcf538bb1410 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 17 Dec 2012 18:17:13 -0500 Subject: nfsd4: free_stateid can use the current stateid Cc: Tigran Mkrtchyan Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 2a2d9b06a413..9d1c5dba2bbb 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1766,6 +1766,7 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_func = (nfsd4op_func)nfsd4_free_stateid, .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING, .op_name = "OP_FREE_STATEID", + .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid, .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, }; -- cgit v1.2.1 From 24ffb93872f7363a01ad639e3c8a9889b46c3f0a Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 12 Dec 2012 15:24:12 -0500 Subject: nfsd4: don't leave freed stateid hashed Note the stateid is hashed early on in init_stid(), but isn't currently being unhashed on error paths. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8e127b39d323..ac8ed96c4199 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2983,6 +2983,7 @@ out: } return; out_free: + unhash_stid(&dp->dl_stid); nfs4_put_delegation(dp); out_no_deleg: flag = NFS4_OPEN_DELEGATE_NONE; -- cgit v1.2.1 From 8bbca57cff7f1b1fd046eebd1e9497a00161c2c1 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 21 Aug 2012 10:46:05 +0800 Subject: eCryptfs: fix to use list_for_each_entry_safe() when delete items Since we will be removing items off the list using list_del() we need to use a safer version of the list_for_each_entry() macro aptly named list_for_each_entry_safe(). We should use the safe macro if the loop involves deletions of items. 
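To make the failure mode concrete, here is a minimal user-space sketch of the same pattern (a hypothetical illustration, not code from the kernel tree): freeing the current entry invalidates its forward pointer, so the safe variant caches the successor in a second cursor before the loop body runs.

#include <stdio.h>
#include <stdlib.h>

struct node {
	int val;
	struct node *next;
};

int main(void)
{
	struct node *head = NULL, *cur, *tmp;
	int i;

	for (i = 0; i < 3; i++) {
		cur = malloc(sizeof(*cur));
		cur->val = i;
		cur->next = head;
		head = cur;
	}

	/*
	 * Equivalent of list_for_each_entry_safe(): read the successor
	 * before deleting/freeing the current entry, so the loop never
	 * dereferences freed memory in order to advance.
	 */
	for (cur = head; cur; cur = tmp) {
		tmp = cur->next;	/* cached next, like the extra cursor */
		printf("deleting %d\n", cur->val);
		free(cur);
	}
	return 0;
}

The kernel macro works the same way, which is why the fix below threads a second cursor variable (tmp) through the loop.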
Signed-off-by: Wei Yongjun [tyhicks: Fixed compiler err - missing list_for_each_entry_safe() param] Signed-off-by: Tyler Hicks --- fs/ecryptfs/kthread.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c index 809e67d05ca3..f1ea610362c6 100644 --- a/fs/ecryptfs/kthread.c +++ b/fs/ecryptfs/kthread.c @@ -102,12 +102,12 @@ int __init ecryptfs_init_kthread(void) void ecryptfs_destroy_kthread(void) { - struct ecryptfs_open_req *req; + struct ecryptfs_open_req *req, *tmp; mutex_lock(&ecryptfs_kthread_ctl.mux); ecryptfs_kthread_ctl.flags |= ECRYPTFS_KTHREAD_ZOMBIE; - list_for_each_entry(req, &ecryptfs_kthread_ctl.req_list, - kthread_ctl_list) { + list_for_each_entry_safe(req, tmp, &ecryptfs_kthread_ctl.req_list, + kthread_ctl_list) { list_del(&req->kthread_ctl_list); *req->lower_file = ERR_PTR(-EIO); complete(&req->done); -- cgit v1.2.1 From 37028758f92d0a3eb74bcfbecf6bc477072e9e28 Mon Sep 17 00:00:00 2001 From: Cong Ding Date: Fri, 7 Dec 2012 22:21:56 +0000 Subject: fs/ecryptfs/crypto.c: make ecryptfs_encode_for_filename() static the function ecryptfs_encode_for_filename() is only used in this file Signed-off-by: Cong Ding Signed-off-by: Tyler Hicks --- fs/ecryptfs/crypto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index ea9931281557..a7b0c2dfb3db 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1935,7 +1935,7 @@ static const unsigned char filename_rev_map[256] = { * @src: Source location for the filename to encode * @src_size: Size of the source in bytes */ -void ecryptfs_encode_for_filename(unsigned char *dst, size_t *dst_size, +static void ecryptfs_encode_for_filename(unsigned char *dst, size_t *dst_size, unsigned char *src, size_t src_size) { size_t num_blocks; -- cgit v1.2.1 From 4c3e696981a565aace08678e70c40709a85f9b2b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 18 Dec 2012 15:43:18 -0500 Subject: Revert "Btrfs: MOD_LOG_KEY_REMOVE_WHILE_MOVING never change node's nritems" This reverts commit 95c80bb1f6b24b57058d971ed252b2c1c5121b51. The bug addressed by this commit was fixed differently back in 3.6 Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index c7b67cf24bba..569c0dfb526c 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1138,13 +1138,13 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, switch (tm->op) { case MOD_LOG_KEY_REMOVE_WHILE_FREEING: BUG_ON(tm->slot < n); - case MOD_LOG_KEY_REMOVE: - n++; case MOD_LOG_KEY_REMOVE_WHILE_MOVING: + case MOD_LOG_KEY_REMOVE: btrfs_set_node_key(eb, &tm->key, tm->slot); btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); btrfs_set_node_ptr_generation(eb, tm->slot, tm->generation); + n++; break; case MOD_LOG_KEY_REPLACE: BUG_ON(tm->slot >= n); -- cgit v1.2.1 From f6af75dac3978d0b4d83939cb5d244b2a844820e Mon Sep 17 00:00:00 2001 From: Cyril Roelandt Date: Tue, 18 Dec 2012 14:21:23 -0800 Subject: ceph: fix dentry reference leak in ceph_encode_fh() dput() was not called in the error path. 
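The shape of this bug is generic to any get/put style reference counting; the following user-space sketch (hypothetical names, not ceph code) shows why taking the reference only after the early validity checks removes the leak:

#include <stdio.h>

struct obj {
	int refcount;
};

static void get_obj(struct obj *o) { o->refcount++; }
static void put_obj(struct obj *o) { o->refcount--; }

/* Buggy shape: the reference is taken before a check that may return
 * early, so the error path leaks one count. */
static int encode_buggy(struct obj *o, int invalid)
{
	get_obj(o);
	if (invalid)
		return -1;	/* leaked: no matching put_obj() */
	put_obj(o);
	return 0;
}

/* Fixed shape, mirroring the ceph patch: validate first, acquire after,
 * so the error return happens while nothing is held. */
static int encode_fixed(struct obj *o, int invalid)
{
	if (invalid)
		return -1;	/* nothing acquired, nothing to drop */
	get_obj(o);
	put_obj(o);
	return 0;
}

int main(void)
{
	struct obj o = { 0 };

	encode_buggy(&o, 1);
	printf("buggy error path leaves refcount=%d\n", o.refcount);

	o.refcount = 0;
	encode_fixed(&o, 1);
	printf("fixed error path leaves refcount=%d\n", o.refcount);
	return 0;
}

Moving d_find_alias() below the snapshot check, as the patch below does, is exactly the second shape: the -EINVAL return happens before any dentry reference exists.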
Signed-off-by: Cyril Roelandt Cc: Sage Weil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ceph/export.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 9349bb37a2fe..ca3ab3f9ca70 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -56,13 +56,15 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, struct ceph_nfs_confh *cfh = (void *)rawfh; int connected_handle_length = sizeof(*cfh)/4; int handle_length = sizeof(*fh)/4; - struct dentry *dentry = d_find_alias(inode); + struct dentry *dentry; struct dentry *parent; /* don't re-export snaps */ if (ceph_snap(inode) != CEPH_NOSNAP) return -EINVAL; + dentry = d_find_alias(inode); + /* if we found an alias, generate a connectable fh */ if (*max_len >= connected_handle_length && dentry) { dout("encode_fh %p connectable\n", dentry); -- cgit v1.2.1 From 57ba86c00f9573b63b8c06810d4f6915efed2442 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 18 Dec 2012 19:35:32 -0500 Subject: Revert "Btrfs: reorder tree mod log operations in deleting a pointer" This reverts commit 6a7a665d78c5dd8bc76a010648c4e7d84517ab5a. This was bug was fixed differently in 3.6, so this commit isn't needed. Conflicts: fs/btrfs/ctree.c Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 569c0dfb526c..eea5da7a2b9a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4611,12 +4611,6 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 nritems; int ret; - if (level) { - ret = tree_mod_log_insert_key(root->fs_info, parent, slot, - MOD_LOG_KEY_REMOVE); - BUG_ON(ret < 0); - } - nritems = btrfs_header_nritems(parent); if (slot != nritems - 1) { if (level) @@ -4627,6 +4621,10 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_node_key_ptr_offset(slot + 1), sizeof(struct btrfs_key_ptr) * (nritems - slot - 1)); + } else if (level) { + ret = tree_mod_log_insert_key(root->fs_info, parent, slot, + MOD_LOG_KEY_REMOVE); + BUG_ON(ret < 0); } nritems--; -- cgit v1.2.1 From ae903caae267154de7cf8576b130ff474630596b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 14 Dec 2012 12:44:11 -0500 Subject: Bury the conditionals from kernel_thread/kernel_execve series All architectures have CONFIG_GENERIC_KERNEL_THREAD CONFIG_GENERIC_KERNEL_EXECVE __ARCH_WANT_SYS_EXECVE None of them have __ARCH_WANT_KERNEL_EXECVE and there are only two callers of kernel_execve() (which is a trivial wrapper for do_execve() now) left. Kill the conditionals and make both callers use do_execve(). 
Signed-off-by: Al Viro --- fs/exec.c | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 721a29929511..090ac91da2e9 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1657,7 +1657,6 @@ int get_dumpable(struct mm_struct *mm) return __get_dumpable(mm->flags); } -#ifdef __ARCH_WANT_SYS_EXECVE SYSCALL_DEFINE3(execve, const char __user *, filename, const char __user *const __user *, argv, @@ -1685,23 +1684,3 @@ asmlinkage long compat_sys_execve(const char __user * filename, return error; } #endif -#endif - -#ifdef __ARCH_WANT_KERNEL_EXECVE -int kernel_execve(const char *filename, - const char *const argv[], - const char *const envp[]) -{ - int ret = do_execve(filename, - (const char __user *const __user *)argv, - (const char __user *const __user *)envp); - if (ret < 0) - return ret; - - /* - * We were successful. We won't be returning to our caller, but - * instead to user space by manipulating the kernel stack. - */ - ret_from_kernel_execve(current_pt_regs()); -} -#endif -- cgit v1.2.1 From 261cb20cb2f0737a247aaf08dff7eb065e3e5b66 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 20 Dec 2012 00:07:18 -0500 Subject: ext4: check dioread_nolock on remount Currently we allow enabling dioread_nolock mount option on remount for filesystems where blocksize < PAGE_CACHE_SIZE. This isn't really supported so fix the bug by moving the check for blocksize != PAGE_CACHE_SIZE into parse_options(). Change the original PAGE_SIZE to PAGE_CACHE_SIZE along the way because that's what we are really interested in. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Reviewed-by: Eric Sandeen Cc: stable@vger.kernel.org --- fs/ext4/super.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3cdb0a2fc648..e09f7d1646ba 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1645,9 +1645,7 @@ static int parse_options(char *options, struct super_block *sb, unsigned int *journal_ioprio, int is_remount) { -#ifdef CONFIG_QUOTA struct ext4_sb_info *sbi = EXT4_SB(sb); -#endif char *p; substring_t args[MAX_OPT_ARGS]; int token; @@ -1696,6 +1694,16 @@ static int parse_options(char *options, struct super_block *sb, } } #endif + if (test_opt(sb, DIOREAD_NOLOCK)) { + int blocksize = + BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); + + if (blocksize < PAGE_CACHE_SIZE) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "dioread_nolock if block size != PAGE_SIZE"); + return 0; + } + } return 1; } @@ -3436,15 +3444,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) clear_opt(sb, DELALLOC); } - blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); - if (test_opt(sb, DIOREAD_NOLOCK)) { - if (blocksize < PAGE_SIZE) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "dioread_nolock if block size != PAGE_SIZE"); - goto failed_mount; - } - } - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sb, POSIX_ACL) ? 
MS_POSIXACL : 0); @@ -3486,6 +3485,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY))) goto failed_mount; + blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); if (blocksize < EXT4_MIN_BLOCK_SIZE || blocksize > EXT4_MAX_BLOCK_SIZE) { ext4_msg(sb, KERN_ERR, -- cgit v1.2.1 From 8367224b2e90eb716dc54f3d83cd73b7efb2ea30 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 13 Dec 2012 06:02:51 -0500 Subject: cifs: fix double-free of "string" in cifs_parse_mount_options Dan reported the following regression in commit d387a5c5: + fs/cifs/connect.c:1903 cifs_parse_mount_options() error: double free of 'string' That patch has some of the new option parsing code free "string" without setting the variable to NULL afterward. Since "string" is automatically freed in an error condition, fix the code to just rely on that instead of freeing it explicitly. Reported-by: Dan Carpenter Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 7635b5db26a7..17c3643e5950 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1624,14 +1624,11 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, case Opt_unc: string = vol->UNC; vol->UNC = match_strdup(args); - if (vol->UNC == NULL) { - kfree(string); + if (vol->UNC == NULL) goto out_nomem; - } convert_delimiter(vol->UNC, '\\'); if (vol->UNC[0] != '\\' || vol->UNC[1] != '\\') { - kfree(string); printk(KERN_ERR "CIFS: UNC Path does not " "begin with // or \\\\\n"); goto cifs_parse_mount_err; @@ -1687,10 +1684,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, string = vol->prepath; vol->prepath = match_strdup(args); - if (vol->prepath == NULL) { - kfree(string); + if (vol->prepath == NULL) goto out_nomem; - } /* Compare old prefixpath= option to new one */ if (!string || strcmp(string, vol->prepath)) printk(KERN_WARNING "CIFS: the value of the " -- cgit v1.2.1 From 2f2591a34db6c9361faa316c91a6e320cb4e6aee Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 18 Dec 2012 06:35:10 -0500 Subject: cifs: don't compare uniqueids in cifs_prime_dcache unless server inode numbers are in use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Oliver reported that commit cd60042c caused his cifs mounts to continually thrash through new inodes on readdir. His servers are not sending inode numbers (or he's not using them), and the new test in that function doesn't account for that sort of setup correctly. If we're not using server inode numbers, then assume that the inode attached to the dentry hasn't changed. Go ahead and update the attributes in place, but keep the same inode number. 
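The decision the patch implements can be reduced to a few lines; here is a compact user-space sketch (the types and names are hypothetical stand-ins for the cifs structures):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct fattr {
	uint64_t cf_uniqueid;	/* id produced by the readdir code */
};

/*
 * If the server is not supplying inode numbers, the locally generated
 * id in the cached inode must win; otherwise every readdir invents a
 * fresh id, the comparison below always fails, and the inode is thrown
 * away and recreated on each listing.
 */
static bool update_in_place(bool server_inum, uint64_t cached_uniqueid,
			    struct fattr *fattr)
{
	if (!server_inum)
		fattr->cf_uniqueid = cached_uniqueid;

	return cached_uniqueid == fattr->cf_uniqueid;
}

int main(void)
{
	struct fattr f = { .cf_uniqueid = 98765 };	/* freshly generated */

	printf("no server inums -> reuse cached inode: %d\n",
	       update_in_place(false, 42, &f));
	return 0;
}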
Cc: # v3.5+ Reported-and-Tested-by: Oliver Mössinger Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/readdir.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 6002fdc920ae..cdd6ff48246b 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -78,6 +78,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, struct dentry *dentry, *alias; struct inode *inode; struct super_block *sb = parent->d_inode->i_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); cFYI(1, "%s: for %s", __func__, name->name); @@ -91,10 +92,20 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, int err; inode = dentry->d_inode; - /* update inode in place if i_ino didn't change */ - if (inode && CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) { - cifs_fattr_to_inode(inode, fattr); - goto out; + if (inode) { + /* + * If we're generating inode numbers, then we don't + * want to clobber the existing one with the one that + * the readdir code created. + */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) + fattr->cf_uniqueid = CIFS_I(inode)->uniqueid; + + /* update inode in place if i_ino didn't change */ + if (CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) { + cifs_fattr_to_inode(inode, fattr); + goto out; + } } err = d_invalidate(dentry); dput(dentry); -- cgit v1.2.1 From 9acbd26b0a5ac4a3d52d31034feb3d935e39032a Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 18 Dec 2012 06:35:10 -0500 Subject: cifs: eliminate cifsERROR variable It's always set to "1" and there's no way to change it to anything else. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifs_debug.h | 6 +----- fs/cifs/cifsfs.c | 1 - 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index 86e92ef2abc1..69ae3d3c3b31 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -37,7 +37,6 @@ void dump_smb(void *, int); #define CIFS_TIMER 0x04 extern int cifsFYI; -extern int cifsERROR; /* * debug ON @@ -64,10 +63,7 @@ do { \ /* error event message: e.g., i/o error */ #define cifserror(fmt, ...) \ -do { \ - if (cifsERROR) \ - printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__); \ -} while (0) + printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__); \ #define cERROR(set, fmt, ...) \ do { \ diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index ce9f3c5421bf..f653835d067b 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -54,7 +54,6 @@ #endif int cifsFYI = 0; -int cifsERROR = 1; int traceSMB = 0; bool enable_oplocks = true; unsigned int linuxExtEnabled = 1; -- cgit v1.2.1 From 1e75529e3c6c18dc535f38454173c4f2dfa99685 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 16 Nov 2012 17:23:50 +0800 Subject: vfs, freeze: use ACCESS_ONCE() to guard access to ->mnt_flags The compiler may optimize the while loop and make the check just be done once, so we should use ACCESS_ONCE() to guard access to ->mnt_flags Signed-off-by: Miao Xie Signed-off-by: Al Viro --- fs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 398a50ff2438..55605c552787 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -313,7 +313,7 @@ int __mnt_want_write(struct vfsmount *m) * incremented count after it has set MNT_WRITE_HOLD. 
*/ smp_mb(); - while (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) + while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) cpu_relax(); /* * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will -- cgit v1.2.1 From 582aa64a04a579d47d05e4a0ee85bf047978ef4d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Dec 2012 08:56:16 -0500 Subject: vfs: remove unneeded permission check from path_init When path_init is called with a valid dfd, that code checks permissions on the open directory fd and returns an error if the check fails. This permission check is redundant, however. Both callers of path_init immediately call link_path_walk afterward. The first thing that link_path_walk does for pathnames that do not consist only of slashes is to check for exec permissions at the starting point of the path walk. And this check in path_init() is on the path taken only when *name != '/' && *name != '\0'. In most cases, these checks are very quick, but when the dfd is for a file on a NFS mount with the actimeo=0, each permission check goes out onto the wire. The result is 2 identical ACCESS calls. Given that these codepaths are fairly "hot", I think it makes sense to eliminate the permission check in path_init and simply assume that the caller will eventually check the permissions before proceeding. Reported-by: Dave Wysochanski Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/namei.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 5f4cdf3ad913..e245d88b4d69 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1903,6 +1903,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, get_fs_pwd(current->fs, &nd->path); } } else { + /* Caller must check execute permissions on the starting path component */ struct fd f = fdget_raw(dfd); struct dentry *dentry; @@ -1916,12 +1917,6 @@ static int path_init(int dfd, const char *name, unsigned int flags, fdput(f); return -ENOTDIR; } - - retval = inode_permission(dentry->d_inode, MAY_EXEC); - if (retval) { - fdput(f); - return retval; - } } nd->path = f.file->f_path; -- cgit v1.2.1 From 741b7c3f77937b2fb7c10aeb4c5c621463582583 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 20 Dec 2012 13:41:28 -0500 Subject: path_init(): make -ENOTDIR failure exits consistent Signed-off-by: Al Viro --- fs/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index e245d88b4d69..35195ff9d194 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1859,7 +1859,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, if (flags & LOOKUP_ROOT) { struct inode *inode = nd->root.dentry->d_inode; if (*name) { - if (!inode->i_op->lookup) + if (!can_lookup(inode)) return -ENOTDIR; retval = inode_permission(inode, MAY_EXEC); if (retval) @@ -1913,7 +1913,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, dentry = f.file->f_path.dentry; if (*name) { - if (!S_ISDIR(dentry->d_inode->i_mode)) { + if (!can_lookup(dentry->d_inode)) { fdput(f); return -ENOTDIR; } -- cgit v1.2.1 From 39e3c9553f34381a1b664c27b0c696a266a5735e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 28 Nov 2012 11:30:53 -0500 Subject: vfs: remove DCACHE_NEED_LOOKUP The code that relied on that flag was ripped out of btrfs quite some time ago, and never added back. Josef indicated that he was going to take a different approach to the problem in btrfs, and that we could just eliminate this flag. 
Cc: Josef Bacik Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/btrfs/inode.c | 16 +--------------- fs/dcache.c | 33 +-------------------------------- fs/namei.c | 11 +---------- 3 files changed, 3 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 67ed24ae86bb..16d9e8e191e6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4262,16 +4262,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (dentry->d_name.len > BTRFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - if (unlikely(d_need_lookup(dentry))) { - memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key)); - kfree(dentry->d_fsdata); - dentry->d_fsdata = NULL; - /* This thing is hashed, drop it for now */ - d_drop(dentry); - } else { - ret = btrfs_inode_by_name(dir, dentry, &location); - } - + ret = btrfs_inode_by_name(dir, dentry, &location); if (ret < 0) return ERR_PTR(ret); @@ -4341,11 +4332,6 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, struct dentry *ret; ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry); - if (unlikely(d_need_lookup(dentry))) { - spin_lock(&dentry->d_lock); - dentry->d_flags &= ~DCACHE_NEED_LOOKUP; - spin_unlock(&dentry->d_lock); - } return ret; } diff --git a/fs/dcache.c b/fs/dcache.c index 3a463d0c4fe8..1782be3fc3ef 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -454,24 +454,6 @@ void d_drop(struct dentry *dentry) } EXPORT_SYMBOL(d_drop); -/* - * d_clear_need_lookup - drop a dentry from cache and clear the need lookup flag - * @dentry: dentry to drop - * - * This is called when we do a lookup on a placeholder dentry that needed to be - * looked up. The dentry should have been hashed in order for it to be found by - * the lookup code, but now needs to be unhashed while we do the actual lookup - * and clear the DCACHE_NEED_LOOKUP flag. - */ -void d_clear_need_lookup(struct dentry *dentry) -{ - spin_lock(&dentry->d_lock); - __d_drop(dentry); - dentry->d_flags &= ~DCACHE_NEED_LOOKUP; - spin_unlock(&dentry->d_lock); -} -EXPORT_SYMBOL(d_clear_need_lookup); - /* * Finish off a dentry we've decided to kill. * dentry->d_lock must be held, returns with it unlocked. @@ -565,13 +547,7 @@ repeat: if (d_unhashed(dentry)) goto kill_it; - /* - * If this dentry needs lookup, don't set the referenced flag so that it - * is more likely to be cleaned up by the dcache shrinker in case of - * memory pressure. - */ - if (!d_need_lookup(dentry)) - dentry->d_flags |= DCACHE_REFERENCED; + dentry->d_flags |= DCACHE_REFERENCED; dentry_lru_add(dentry); dentry->d_count--; @@ -1736,13 +1712,6 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, return found; } - /* - * We are going to instantiate this dentry, unhash it and clear the - * lookup flag so we can do that. - */ - if (unlikely(d_need_lookup(found))) - d_clear_need_lookup(found); - /* * Negative dentry: instantiate it unless the inode is a directory and * already has a dentry. 
diff --git a/fs/namei.c b/fs/namei.c index 35195ff9d194..25a41e02984b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1275,9 +1275,7 @@ static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir, *need_lookup = false; dentry = d_lookup(dir, name); if (dentry) { - if (d_need_lookup(dentry)) { - *need_lookup = true; - } else if (dentry->d_flags & DCACHE_OP_REVALIDATE) { + if (dentry->d_flags & DCACHE_OP_REVALIDATE) { error = d_revalidate(dentry, flags); if (unlikely(error <= 0)) { if (error < 0) { @@ -1383,8 +1381,6 @@ static int lookup_fast(struct nameidata *nd, struct qstr *name, return -ECHILD; nd->seq = seq; - if (unlikely(d_need_lookup(dentry))) - goto unlazy; if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { status = d_revalidate(dentry, nd->flags); if (unlikely(status <= 0)) { @@ -1410,11 +1406,6 @@ unlazy: if (unlikely(!dentry)) goto need_lookup; - if (unlikely(d_need_lookup(dentry))) { - dput(dentry); - goto need_lookup; - } - if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval) status = d_revalidate(dentry, nd->flags); if (unlikely(status <= 0)) { -- cgit v1.2.1 From 72651cac884b1e285fa8e8314b10e9f1b8458802 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 5 Dec 2012 14:40:14 +0100 Subject: fs: Fix imbalance in freeze protection in mark_files_ro() File descriptors (even those for writing) do not hold freeze protection. Thus mark_files_ro() must call __mnt_drop_write() to only drop protection against remount read-only. Calling mnt_drop_write_file() as we do now results in: [ BUG: bad unlock balance detected! ] 3.7.0-rc6-00028-g88e75b6 #101 Not tainted ------------------------------------- kworker/1:2/79 is trying to release lock (sb_writers) at: [] mnt_drop_write+0x24/0x30 but there are no more locks to release! 
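A minimal sketch of the imbalance, with user-space counters standing in for the real annotations (this assumes the simplified two-level pairing described above, not the exact kernel implementation):

#include <stdio.h>

static int mnt_writers;	/* protection against remount read-only */
static int sb_writers;	/* protection against freezing */

static void __mnt_want_write(void) { mnt_writers++; }
static void __mnt_drop_write(void) { mnt_writers--; }

static void mnt_want_write(void)
{
	sb_writers++;		/* freeze protection ... */
	__mnt_want_write();	/* ... plus remount-ro protection */
}

static void mnt_drop_write(void)
{
	__mnt_drop_write();
	sb_writers--;
}

int main(void)
{
	/* Normal, balanced pairing. */
	mnt_want_write();
	mnt_drop_write();

	/* The open files handled by mark_files_ro() hold only the
	 * low-level half ... */
	__mnt_want_write();

	/* ... so releasing with the full helper drops a freeze count
	 * that was never taken: the "bad unlock balance" above. */
	mnt_drop_write();

	printf("mnt_writers=%d sb_writers=%d (sb_writers underflowed)\n",
	       mnt_writers, sb_writers);
	return 0;
}

Hence the one-line fix below: release with __mnt_drop_write(), the variant that matches what the file descriptors actually hold.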
Reported-by: Zdenek Kabelac CC: stable@vger.kernel.org Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/file_table.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/file_table.c b/fs/file_table.c index a72bf9ddd0d2..de9e9653d611 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -458,8 +458,8 @@ void mark_files_ro(struct super_block *sb) spin_unlock(&f->f_lock); if (file_check_writeable(f) != 0) continue; + __mnt_drop_write(f->f_path.mnt); file_release_write(f); - mnt_drop_write_file(f); } while_file_list_for_each_entry; lg_global_unlock(&files_lglock); } -- cgit v1.2.1 From 83f6e3710a932d400100767ad445a4bd9476e083 Mon Sep 17 00:00:00 2001 From: Marco Stornelli Date: Sat, 15 Dec 2012 11:45:14 +0100 Subject: ufs: drop vmtruncate Removed vmtruncate Signed-off-by: Marco Stornelli Signed-off-by: Al Viro --- fs/ufs/inode.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index eb6d0b7dc879..ff24e4449ece 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -526,6 +526,14 @@ int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len) return __block_write_begin(page, pos, len, ufs_getfrag_block); } +static void ufs_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) + truncate_pagecache(inode, to, inode->i_size); +} + static int ufs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -534,11 +542,8 @@ static int ufs_write_begin(struct file *file, struct address_space *mapping, ret = block_write_begin(mapping, pos, len, flags, pagep, ufs_getfrag_block); - if (unlikely(ret)) { - loff_t isize = mapping->host->i_size; - if (pos + len > isize) - vmtruncate(mapping->host, isize); - } + if (unlikely(ret)) + ufs_write_failed(mapping, pos + len); return ret; } -- cgit v1.2.1 From fa4d62ae17c7415f1ea824076870b7ad9b51fd06 Mon Sep 17 00:00:00 2001 From: Marco Stornelli Date: Sat, 15 Dec 2012 11:45:58 +0100 Subject: sysv: drop vmtruncate Removed vmtruncate Signed-off-by: Marco Stornelli Signed-off-by: Al Viro --- fs/sysv/file.c | 5 +++-- fs/sysv/itree.c | 17 ++++++++++++----- 2 files changed, 15 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/sysv/file.c b/fs/sysv/file.c index 0a65939508e9..9d4dc6831792 100644 --- a/fs/sysv/file.c +++ b/fs/sysv/file.c @@ -41,9 +41,11 @@ static int sysv_setattr(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { - error = vmtruncate(inode, attr->ia_size); + error = inode_newsize_ok(inode, attr->ia_size); if (error) return error; + truncate_setsize(inode, attr->ia_size); + sysv_truncate(inode); } setattr_copy(inode, attr); @@ -52,7 +54,6 @@ static int sysv_setattr(struct dentry *dentry, struct iattr *attr) } const struct inode_operations sysv_file_inode_operations = { - .truncate = sysv_truncate, .setattr = sysv_setattr, .getattr = sysv_getattr, }; diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index 90b54b438789..c1a591a4725b 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -464,6 +464,16 @@ int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len) return __block_write_begin(page, pos, len, get_block); } +static void sysv_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, to, 
inode->i_size); + sysv_truncate(inode); + } +} + static int sysv_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -471,11 +481,8 @@ static int sysv_write_begin(struct file *file, struct address_space *mapping, int ret; ret = block_write_begin(mapping, pos, len, flags, pagep, get_block); - if (unlikely(ret)) { - loff_t isize = mapping->host->i_size; - if (pos + len > isize) - vmtruncate(mapping->host, isize); - } + if (unlikely(ret)) + sysv_write_failed(mapping, pos + len); return ret; } -- cgit v1.2.1 From cfac4b47c664e207740880d6492938761c53d74b Mon Sep 17 00:00:00 2001 From: Marco Stornelli Date: Sat, 15 Dec 2012 11:47:31 +0100 Subject: reiserfs: drop vmtruncate Removed vmtruncate Signed-off-by: Marco Stornelli Reviewed-by: Jan Kara Signed-off-by: Al Viro --- fs/reiserfs/file.c | 3 +-- fs/reiserfs/inode.c | 15 +++++++++++---- fs/reiserfs/reiserfs.h | 1 + 3 files changed, 13 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 8375c922c0d5..50302d6f8895 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -126,7 +126,7 @@ static int reiserfs_file_open(struct inode *inode, struct file *file) return err; } -static void reiserfs_vfs_truncate_file(struct inode *inode) +void reiserfs_vfs_truncate_file(struct inode *inode) { mutex_lock(&(REISERFS_I(inode)->tailpack)); reiserfs_truncate_file(inode, 1); @@ -312,7 +312,6 @@ const struct file_operations reiserfs_file_operations = { }; const struct inode_operations reiserfs_file_inode_operations = { - .truncate = reiserfs_vfs_truncate_file, .setattr = reiserfs_setattr, .setxattr = reiserfs_setxattr, .getxattr = reiserfs_getxattr, diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index d83736fbc26c..95d7680ead47 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -3085,8 +3085,10 @@ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb, loff_t isize = i_size_read(inode); loff_t end = offset + iov_length(iov, nr_segs); - if (end > isize) - vmtruncate(inode, isize); + if ((end > isize) && inode_newsize_ok(inode, isize) == 0) { + truncate_setsize(inode, isize); + reiserfs_vfs_truncate_file(inode); + } } return ret; @@ -3200,8 +3202,13 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) */ reiserfs_write_unlock_once(inode->i_sb, depth); if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) - error = vmtruncate(inode, attr->ia_size); + attr->ia_size != i_size_read(inode)) { + error = inode_newsize_ok(inode, attr->ia_size); + if (!error) { + truncate_setsize(inode, attr->ia_size); + reiserfs_vfs_truncate_file(inode); + } + } if (!error) { setattr_copy(inode, attr); diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 33215f57ea06..157e474ab303 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -2455,6 +2455,7 @@ struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct *, int count); int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *); +void reiserfs_vfs_truncate_file(struct inode *inode); int reiserfs_commit_page(struct inode *inode, struct page *page, unsigned from, unsigned to); void reiserfs_flush_old_commits(struct super_block *); -- cgit v1.2.1 From 46f69557103e11fb963ae5c98b7777e90493241b Mon Sep 17 00:00:00 2001 From: Marco Stornelli Date: Sat, 15 Dec 2012 11:48:48 +0100 Subject: procfs: drop vmtruncate Removed vmtruncate Signed-off-by: Marco Stornelli Signed-off-by: 
Al Viro --- fs/proc/base.c | 7 ------- fs/proc/generic.c | 9 +-------- fs/proc/proc_sysctl.c | 7 ------- 3 files changed, 1 insertion(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 5a5a0be40e40..9b43ff77a51e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -542,13 +542,6 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; - if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) { - error = vmtruncate(inode, attr->ia_size); - if (error) - return error; - } - setattr_copy(inode, attr); mark_inode_dirty(inode); return 0; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 7b3ae3cc0ef9..2e4ed13b9eed 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -261,16 +261,9 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) if (error) return error; - if ((iattr->ia_valid & ATTR_SIZE) && - iattr->ia_size != i_size_read(inode)) { - error = vmtruncate(inode, iattr->ia_size); - if (error) - return error; - } - setattr_copy(inode, iattr); mark_inode_dirty(inode); - + de->uid = inode->i_uid; de->gid = inode->i_gid; de->mode = inode->i_mode; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 701580ddfcc3..1827d88ad58b 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -736,13 +736,6 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; - if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) { - error = vmtruncate(inode, attr->ia_size); - if (error) - return error; - } - setattr_copy(inode, attr); mark_inode_dirty(inode); return 0; -- cgit v1.2.1 From a8f5293aac161f9dfd70d0c03c3e407d417fafe1 Mon Sep 17 00:00:00 2001 From: Marco Stornelli Date: Sat, 15 Dec 2012 11:49:42 +0100 Subject: omfs: drop vmtruncate Removed vmtruncate Signed-off-by: Marco Stornelli Acked-by: Bob Copeland Signed-off-by: Al Viro --- fs/omfs/file.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 77e3cb2962b4..e0d9b3e722bd 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -306,6 +306,16 @@ omfs_writepages(struct address_space *mapping, struct writeback_control *wbc) return mpage_writepages(mapping, wbc, omfs_get_block); } +static void omfs_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, to, inode->i_size); + omfs_truncate(inode); + } +} + static int omfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -314,11 +324,8 @@ static int omfs_write_begin(struct file *file, struct address_space *mapping, ret = block_write_begin(mapping, pos, len, flags, pagep, omfs_get_block); - if (unlikely(ret)) { - loff_t isize = mapping->host->i_size; - if (pos + len > isize) - vmtruncate(mapping->host, isize); - } + if (unlikely(ret)) + omfs_write_failed(mapping, pos + len); return ret; } @@ -350,9 +357,11 @@ static int omfs_setattr(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { - error = vmtruncate(inode, attr->ia_size); + error = inode_newsize_ok(inode, attr->ia_size); if (error) return error; + truncate_setsize(inode, attr->ia_size); + omfs_truncate(inode); } setattr_copy(inode, attr); @@ -362,7 +371,6 @@ static int omfs_setattr(struct dentry *dentry, struct iattr 
*attr) const struct inode_operations omfs_file_inops = { .setattr = omfs_setattr, - .truncate = omfs_truncate }; const struct address_space_operations omfs_aops = { -- cgit v1.2.1 From a6ff03771e9d4a2a64cd1414e32c6b369ae935ba Mon Sep 17 00:00:00 2001 From: Marco Stornelli Date: Sat, 15 Dec 2012 11:50:20 +0100 Subject: ocfs2: drop vmtruncate Removed vmtruncate Signed-off-by: Marco Stornelli Signed-off-by: Al Viro --- fs/ocfs2/file.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index fe492e1a3cfc..37d313ede159 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1218,24 +1218,6 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) } } - /* - * This will intentionally not wind up calling truncate_setsize(), - * since all the work for a size change has been done above. - * Otherwise, we could get into problems with truncate as - * ip_alloc_sem is used there to protect against i_size - * changes. - * - * XXX: this means the conditional below can probably be removed. - */ - if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) { - status = vmtruncate(inode, attr->ia_size); - if (status) { - mlog_errno(status); - goto bail_commit; - } - } - setattr_copy(inode, attr); mark_inode_dirty(inode); -- cgit v1.2.1 From 62295183846234b423a67116c36e00bd29dfd856 Mon Sep 17 00:00:00 2001 From: Marco Stornelli Date: Sat, 15 Dec 2012 11:51:11 +0100 Subject: adfs: drop vmtruncate Removed vmtruncate Signed-off-by: Marco Stornelli Signed-off-by: Al Viro --- fs/adfs/inode.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index e9bad5093a3f..5f95d1ed9c6d 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -45,6 +45,14 @@ static int adfs_readpage(struct file *file, struct page *page) return block_read_full_page(page, adfs_get_block); } +static void adfs_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) + truncate_pagecache(inode, to, inode->i_size); +} + static int adfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -55,11 +63,8 @@ static int adfs_write_begin(struct file *file, struct address_space *mapping, ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, adfs_get_block, &ADFS_I(mapping->host)->mmu_private); - if (unlikely(ret)) { - loff_t isize = mapping->host->i_size; - if (pos + len > isize) - vmtruncate(mapping->host, isize); - } + if (unlikely(ret)) + adfs_write_failed(mapping, pos + len); return ret; } -- cgit v1.2.1 From 1dc1834f4292624f46da7e0309bc04a3cca1b07c Mon Sep 17 00:00:00 2001 From: Marco Stornelli Date: Sat, 15 Dec 2012 11:51:53 +0100 Subject: affs: drop vmtruncate Removed vmtruncate Signed-off-by: Marco Stornelli Signed-off-by: Al Viro --- fs/affs/file.c | 18 ++++++++++++------ fs/affs/inode.c | 5 ++++- 2 files changed, 16 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/affs/file.c b/fs/affs/file.c index 2f4c935cb327..af3261b78102 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -39,7 +39,6 @@ const struct file_operations affs_file_operations = { }; const struct inode_operations affs_file_inode_operations = { - .truncate = affs_truncate, .setattr = affs_notify_change, }; @@ -402,6 +401,16 @@ static int affs_readpage(struct file *file, struct page *page) return block_read_full_page(page, 
affs_get_block); } +static void affs_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, to, inode->i_size); + affs_truncate(inode); + } +} + static int affs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -412,11 +421,8 @@ static int affs_write_begin(struct file *file, struct address_space *mapping, ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, affs_get_block, &AFFS_I(mapping->host)->mmu_private); - if (unlikely(ret)) { - loff_t isize = mapping->host->i_size; - if (pos + len > isize) - vmtruncate(mapping->host, isize); - } + if (unlikely(ret)) + affs_write_failed(mapping, pos + len); return ret; } diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 15c484268229..0e092d08680e 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -237,9 +237,12 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { - error = vmtruncate(inode, attr->ia_size); + error = inode_newsize_ok(inode, attr->ia_size); if (error) return error; + + truncate_setsize(inode, attr->ia_size); + affs_truncate(inode); } setattr_copy(inode, attr); -- cgit v1.2.1 From 41ddaeeb9d02ca12fee80b4eb23ab388b3ebe69d Mon Sep 17 00:00:00 2001 From: Marco Stornelli Date: Sat, 15 Dec 2012 11:52:33 +0100 Subject: bfs: drop vmtruncate Removed vmtruncate Signed-off-by: Marco Stornelli Signed-off-by: Al Viro --- fs/bfs/file.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bfs/file.c b/fs/bfs/file.c index f20e8a71062f..ad3ea1497cc3 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -161,6 +161,14 @@ static int bfs_readpage(struct file *file, struct page *page) return block_read_full_page(page, bfs_get_block); } +static void bfs_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) + truncate_pagecache(inode, to, inode->i_size); +} + static int bfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -169,11 +177,8 @@ static int bfs_write_begin(struct file *file, struct address_space *mapping, ret = block_write_begin(mapping, pos, len, flags, pagep, bfs_get_block); - if (unlikely(ret)) { - loff_t isize = mapping->host->i_size; - if (pos + len > isize) - vmtruncate(mapping->host, isize); - } + if (unlikely(ret)) + bfs_write_failed(mapping, pos + len); return ret; } -- cgit v1.2.1 From c8cf464bc5cfa689357796db2294c4b2474095fb Mon Sep 17 00:00:00 2001 From: Marco Stornelli Date: Sat, 15 Dec 2012 11:53:15 +0100 Subject: hfs: drop vmtruncate Removed vmtruncate Signed-off-by: Marco Stornelli Signed-off-by: Al Viro --- fs/hfs/inode.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 0b35903219bc..d47f11658c17 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -35,6 +35,16 @@ static int hfs_readpage(struct file *file, struct page *page) return block_read_full_page(page, hfs_get_block); } +static void hfs_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, to, inode->i_size); + hfs_file_truncate(inode); + } +} + static int 
hfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -45,11 +55,8 @@ static int hfs_write_begin(struct file *file, struct address_space *mapping, ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, hfs_get_block, &HFS_I(mapping->host)->phys_size); - if (unlikely(ret)) { - loff_t isize = mapping->host->i_size; - if (pos + len > isize) - vmtruncate(mapping->host, isize); - } + if (unlikely(ret)) + hfs_write_failed(mapping, pos + len); return ret; } @@ -120,6 +127,7 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; ssize_t ret; @@ -135,7 +143,7 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb, loff_t end = offset + iov_length(iov, nr_segs); if (end > isize) - vmtruncate(inode, isize); + hfs_write_failed(mapping, end); } return ret; @@ -617,9 +625,12 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr) attr->ia_size != i_size_read(inode)) { inode_dio_wait(inode); - error = vmtruncate(inode, attr->ia_size); + error = inode_newsize_ok(inode, attr->ia_size); if (error) return error; + + truncate_setsize(inode, attr->ia_size); + hfs_file_truncate(inode); } setattr_copy(inode, attr); @@ -668,7 +679,6 @@ static const struct file_operations hfs_file_operations = { static const struct inode_operations hfs_file_inode_operations = { .lookup = hfs_file_lookup, - .truncate = hfs_file_truncate, .setattr = hfs_inode_setattr, .setxattr = hfs_setxattr, .getxattr = hfs_getxattr, -- cgit v1.2.1 From c4d6d8dbf335c7fa47341654a37c53a512b519bb Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Dec 2012 21:52:32 +0000 Subject: CacheFiles: Fix the marking of cached pages Under some circumstances CacheFiles defers the marking of pages with PG_fscache so that it can take advantage of pagevecs to reduce the number of calls to fscache_mark_pages_cached() and the netfs's hook to keep track of this. There are, however, two problems with this: (1) It can lead to the PG_fscache mark being applied _after_ the page is set PG_uptodate and unlocked (by the call to fscache_end_io()). (2) CacheFiles's ref on the page is dropped immediately following fscache_end_io() - and so may not still be held when the mark is applied. This can lead to the page being passed back to the allocator before the mark is applied. Fix this by, where appropriate, marking the page before calling fscache_end_io() and releasing the page. This means that we can't take advantage of pagevecs and have to make a separate call for each page to the marking routines. The symptoms of this are Bad Page state errors cropping up under memory pressure, for example: BUG: Bad page state in process tar pfn:002da page:ffffea0000009fb0 count:0 mapcount:0 mapping: (null) index:0x1447 page flags: 0x1000(private_2) Pid: 4574, comm: tar Tainted: G W 3.1.0-rc4-fsdevel+ #1064 Call Trace: [] ? dump_page+0xb9/0xbe [] bad_page+0xd5/0xea [] get_page_from_freelist+0x35b/0x46a [] __alloc_pages_nodemask+0x362/0x662 [] __do_page_cache_readahead+0x13a/0x267 [] ? __do_page_cache_readahead+0xa2/0x267 [] ra_submit+0x1c/0x20 [] ondemand_readahead+0x28b/0x29a [] ? 
ondemand_readahead+0x163/0x29a [] page_cache_sync_readahead+0x38/0x3a [] generic_file_aio_read+0x2ab/0x67e [] nfs_file_read+0xa4/0xc9 [nfs] [] do_sync_read+0xba/0xfa [] ? security_file_permission+0x7b/0x84 [] ? rw_verify_area+0xab/0xc8 [] vfs_read+0xaa/0x13a [] sys_read+0x45/0x6c [] system_call_fastpath+0x16/0x1b As can be seen, PG_private_2 (== PG_fscache) is set in the page flags. Instrumenting fscache_mark_pages_cached() to verify whether page->mapping was set appropriately showed that sometimes it wasn't. This led to the discovery that sometimes the page has apparently been reclaimed by the time the marker got to see it. Reported-by: M. Stevens Signed-off-by: David Howells Reviewed-by: Jeff Layton --- fs/cachefiles/rdwr.c | 34 ++++++++++-------------------- fs/fscache/page.c | 59 ++++++++++++++++++++++++++++++++-------------------- 2 files changed, 47 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index c994691d9445..3367abdcdac4 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -176,9 +176,8 @@ static void cachefiles_read_copier(struct fscache_operation *_op) recheck: if (PageUptodate(monitor->back_page)) { copy_highpage(monitor->netfs_page, monitor->back_page); - - pagevec_add(&pagevec, monitor->netfs_page); - fscache_mark_pages_cached(monitor->op, &pagevec); + fscache_mark_page_cached(monitor->op, + monitor->netfs_page); error = 0; } else if (!PageError(monitor->back_page)) { /* the page has probably been truncated */ @@ -335,8 +334,7 @@ backing_page_already_present: backing_page_already_uptodate: _debug("- uptodate"); - pagevec_add(pagevec, netpage); - fscache_mark_pages_cached(op, pagevec); + fscache_mark_page_cached(op, netpage); copy_highpage(netpage, backpage); fscache_end_io(op, netpage, 0); @@ -448,8 +446,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, &pagevec); } else if (cachefiles_has_space(cache, 0, 1) == 0) { /* there's space in the cache we can use */ - pagevec_add(&pagevec, page); - fscache_mark_pages_cached(op, &pagevec); + fscache_mark_page_cached(op, page); ret = -ENODATA; } else { ret = -ENOBUFS; @@ -465,8 +462,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, */ static int cachefiles_read_backing_file(struct cachefiles_object *object, struct fscache_retrieval *op, - struct list_head *list, - struct pagevec *mark_pvec) + struct list_head *list) { struct cachefiles_one_read *monitor = NULL; struct address_space *bmapping = object->backer->d_inode->i_mapping; @@ -626,13 +622,13 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, page_cache_release(backpage); backpage = NULL; - if (!pagevec_add(mark_pvec, netpage)) - fscache_mark_pages_cached(op, mark_pvec); + fscache_mark_page_cached(op, netpage); page_cache_get(netpage); if (!pagevec_add(&lru_pvec, netpage)) __pagevec_lru_add_file(&lru_pvec); + /* the netpage is unlocked and marked up to date here */ fscache_end_io(op, netpage, 0); page_cache_release(netpage); netpage = NULL; @@ -775,15 +771,11 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, /* submit the apparently valid pages to the backing fs to be read from * disk */ if (nrbackpages > 0) { - ret2 = cachefiles_read_backing_file(object, op, &backpages, - &pagevec); + ret2 = cachefiles_read_backing_file(object, op, &backpages); if (ret2 == -ENOMEM || ret2 == -EINTR) ret = ret2; } - if (pagevec_count(&pagevec) > 0) - fscache_mark_pages_cached(op, &pagevec); - _leave(" = %d [nr=%u%s]", ret, *nr_pages, 
list_empty(pages) ? " empty" : ""); return ret; @@ -806,7 +798,6 @@ int cachefiles_allocate_page(struct fscache_retrieval *op, { struct cachefiles_object *object; struct cachefiles_cache *cache; - struct pagevec pagevec; int ret; object = container_of(op->op.object, @@ -817,13 +808,10 @@ int cachefiles_allocate_page(struct fscache_retrieval *op, _enter("%p,{%lx},", object, page->index); ret = cachefiles_has_space(cache, 0, 1); - if (ret == 0) { - pagevec_init(&pagevec, 0); - pagevec_add(&pagevec, page); - fscache_mark_pages_cached(op, &pagevec); - } else { + if (ret == 0) + fscache_mark_page_cached(op, page); + else ret = -ENOBUFS; - } _leave(" = %d", ret); return ret; diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 3f7a59bfa7ad..d7c663cfc923 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -914,6 +914,40 @@ done: } EXPORT_SYMBOL(__fscache_uncache_page); +/** + * fscache_mark_page_cached - Mark a page as being cached + * @op: The retrieval op pages are being marked for + * @page: The page to be marked + * + * Mark a netfs page as being cached. After this is called, the netfs + * must call fscache_uncache_page() to remove the mark. + */ +void fscache_mark_page_cached(struct fscache_retrieval *op, struct page *page) +{ + struct fscache_cookie *cookie = op->op.object->cookie; + +#ifdef CONFIG_FSCACHE_STATS + atomic_inc(&fscache_n_marks); +#endif + + _debug("- mark %p{%lx}", page, page->index); + if (TestSetPageFsCache(page)) { + static bool once_only; + if (!once_only) { + once_only = true; + printk(KERN_WARNING "FS-Cache:" + " Cookie type %s marked page %lx" + " multiple times\n", + cookie->def->name, page->index); + } + } + + if (cookie->def->mark_page_cached) + cookie->def->mark_page_cached(cookie->netfs_data, + op->mapping, page); +} +EXPORT_SYMBOL(fscache_mark_page_cached); + /** * fscache_mark_pages_cached - Mark pages as being cached * @op: The retrieval op pages are being marked for @@ -925,32 +959,11 @@ EXPORT_SYMBOL(__fscache_uncache_page); void fscache_mark_pages_cached(struct fscache_retrieval *op, struct pagevec *pagevec) { - struct fscache_cookie *cookie = op->op.object->cookie; unsigned long loop; -#ifdef CONFIG_FSCACHE_STATS - atomic_add(pagevec->nr, &fscache_n_marks); -#endif - - for (loop = 0; loop < pagevec->nr; loop++) { - struct page *page = pagevec->pages[loop]; - - _debug("- mark %p{%lx}", page, page->index); - if (TestSetPageFsCache(page)) { - static bool once_only; - if (!once_only) { - once_only = true; - printk(KERN_WARNING "FS-Cache:" - " Cookie type %s marked page %lx" - " multiple times\n", - cookie->def->name, page->index); - } - } - } + for (loop = 0; loop < pagevec->nr; loop++) + fscache_mark_page_cached(op, pagevec->pages[loop]); - if (cookie->def->mark_pages_cached) - cookie->def->mark_pages_cached(cookie->netfs_data, - op->mapping, pagevec); pagevec_reinit(pagevec); } EXPORT_SYMBOL(fscache_mark_pages_cached); -- cgit v1.2.1 From 5f4f9f4af185d5e76c966d2d3420a61870c856e7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Dec 2012 21:52:33 +0000 Subject: CacheFiles: Downgrade the requirements passed to the allocator Downgrade the requirements passed to the allocator in the gfp flags parameter. FS-Cache/CacheFiles can handle OOM conditions simply by aborting the attempt to store an object or a page in the cache. 
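The idea, sketched in user space with a hypothetical stand-in for kmalloc(size, cachefiles_gfp): the real flags (__GFP_NORETRY, __GFP_NOMEMALLOC) tell the page allocator not to retry hard and not to dip into emergency reserves, because the caller can always decline to cache.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Stand-in for kmalloc(size, cachefiles_gfp): allowed to give up early. */
static void *cache_alloc(size_t size)
{
	if (getenv("SIMULATE_OOM"))
		return NULL;	/* allocator declined; caller must cope */
	return malloc(size);
}

/* Caching is best-effort: on allocation failure, skip the cache rather
 * than retrying or consuming reserves the rest of the system may need
 * to make progress. */
static int store_page(void)
{
	void *monitor = cache_alloc(128);

	if (!monitor)
		return -ENOBUFS;	/* abort the store; data stays uncached */

	free(monitor);
	return 0;
}

int main(void)
{
	printf("store_page() = %d\n", store_page());
	return 0;
}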
Signed-off-by: David Howells --- fs/cachefiles/interface.c | 8 ++++---- fs/cachefiles/internal.h | 2 ++ fs/cachefiles/key.c | 2 +- fs/cachefiles/rdwr.c | 18 ++++++++++-------- fs/cachefiles/xattr.c | 2 +- fs/fscache/page.c | 2 +- 6 files changed, 19 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 67bef6d01484..9bff0f878cfd 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -41,12 +41,12 @@ static struct fscache_object *cachefiles_alloc_object( _enter("{%s},%p,", cache->cache.identifier, cookie); - lookup_data = kmalloc(sizeof(*lookup_data), GFP_KERNEL); + lookup_data = kmalloc(sizeof(*lookup_data), cachefiles_gfp); if (!lookup_data) goto nomem_lookup_data; /* create a new object record and a temporary leaf image */ - object = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL); + object = kmem_cache_alloc(cachefiles_object_jar, cachefiles_gfp); if (!object) goto nomem_object; @@ -63,7 +63,7 @@ static struct fscache_object *cachefiles_alloc_object( * - stick the length on the front and leave space on the back for the * encoder */ - buffer = kmalloc((2 + 512) + 3, GFP_KERNEL); + buffer = kmalloc((2 + 512) + 3, cachefiles_gfp); if (!buffer) goto nomem_buffer; @@ -219,7 +219,7 @@ static void cachefiles_update_object(struct fscache_object *_object) return; } - auxdata = kmalloc(2 + 512 + 3, GFP_KERNEL); + auxdata = kmalloc(2 + 512 + 3, cachefiles_gfp); if (!auxdata) { _leave(" [nomem]"); return; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index bd6bc1bde2d7..49382519907a 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -23,6 +23,8 @@ extern unsigned cachefiles_debug; #define CACHEFILES_DEBUG_KLEAVE 2 #define CACHEFILES_DEBUG_KDEBUG 4 +#define cachefiles_gfp (__GFP_WAIT | __GFP_NORETRY | __GFP_NOMEMALLOC) + /* * node records */ diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c index 81b8b2b3a674..33b58c60f2d1 100644 --- a/fs/cachefiles/key.c +++ b/fs/cachefiles/key.c @@ -78,7 +78,7 @@ char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type) _debug("max: %d", max); - key = kmalloc(max, GFP_KERNEL); + key = kmalloc(max, cachefiles_gfp); if (!key) return NULL; diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 3367abdcdac4..9108b8ea505a 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -238,7 +238,7 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object, _debug("read back %p{%lu,%d}", netpage, netpage->index, page_count(netpage)); - monitor = kzalloc(sizeof(*monitor), GFP_KERNEL); + monitor = kzalloc(sizeof(*monitor), cachefiles_gfp); if (!monitor) goto nomem; @@ -257,13 +257,14 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object, goto backing_page_already_present; if (!newpage) { - newpage = page_cache_alloc_cold(bmapping); + newpage = __page_cache_alloc(cachefiles_gfp | + __GFP_COLD); if (!newpage) goto nomem_monitor; } ret = add_to_page_cache(newpage, bmapping, - netpage->index, GFP_KERNEL); + netpage->index, cachefiles_gfp); if (ret == 0) goto installed_new_backing_page; if (ret != -EEXIST) @@ -481,7 +482,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, netpage, netpage->index, page_count(netpage)); if (!monitor) { - monitor = kzalloc(sizeof(*monitor), GFP_KERNEL); + monitor = kzalloc(sizeof(*monitor), cachefiles_gfp); if (!monitor) goto nomem; @@ -496,13 +497,14 @@ static int cachefiles_read_backing_file(struct cachefiles_object 
*object, goto backing_page_already_present; if (!newpage) { - newpage = page_cache_alloc_cold(bmapping); + newpage = __page_cache_alloc(cachefiles_gfp | + __GFP_COLD); if (!newpage) goto nomem; } ret = add_to_page_cache(newpage, bmapping, - netpage->index, GFP_KERNEL); + netpage->index, cachefiles_gfp); if (ret == 0) goto installed_new_backing_page; if (ret != -EEXIST) @@ -532,7 +534,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, _debug("- monitor add"); ret = add_to_page_cache(netpage, op->mapping, netpage->index, - GFP_KERNEL); + cachefiles_gfp); if (ret < 0) { if (ret == -EEXIST) { page_cache_release(netpage); @@ -608,7 +610,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, _debug("- uptodate"); ret = add_to_page_cache(netpage, op->mapping, netpage->index, - GFP_KERNEL); + cachefiles_gfp); if (ret < 0) { if (ret == -EEXIST) { page_cache_release(netpage); diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index e18b183b47e1..73b46288b54b 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -174,7 +174,7 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object, ASSERT(dentry); ASSERT(dentry->d_inode); - auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL); + auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, cachefiles_gfp); if (!auxbuf) { _leave(" = -ENOMEM"); return -ENOMEM; } diff --git a/fs/fscache/page.c b/fs/fscache/page.c index d7c663cfc923..248a12e22532 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -759,7 +759,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, fscache_stat(&fscache_n_stores); - op = kzalloc(sizeof(*op), GFP_NOIO); + op = kzalloc(sizeof(*op), GFP_NOIO | __GFP_NOMEMALLOC | __GFP_NORETRY); if (!op) goto nomem; -- cgit v1.2.1 From 0f972b5696c0a0677a9b3a18fee45cc0e8de4184 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Dec 2012 21:52:33 +0000 Subject: FS-Cache: Check that there are no read ops when cookie relinquished Check that the netfs isn't trying to relinquish a cookie that still has read operations in progress upon it. If there are, then log a warning and BUG. Signed-off-by: David Howells --- fs/fscache/cookie.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 990535071a8a..0666996adf80 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -452,6 +452,14 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) _debug("RELEASE OBJ%x", object->debug_id); + if (atomic_read(&object->n_reads)) { + spin_unlock(&cookie->lock); + printk(KERN_ERR "FS-Cache:" + " Cookie '%s' still has %d outstanding reads\n", + cookie->def->name, atomic_read(&object->n_reads)); + BUG(); + } + /* detach each cache object from the object cookie */ spin_lock(&object->lock); hlist_del_init(&object->cookie_link); -- cgit v1.2.1 From 37491a1339df26259b06dfa33f30e574e9e52034 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Dec 2012 21:52:34 +0000 Subject: CacheFiles: Make some debugging statements conditional Downgrade some debugging statements so that they do not print unconditionally, but are instead conditional on the appropriate module parameter setting.
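The k* variants print unconditionally; the _-prefixed variants are gated on the cachefiles_debug module parameter. A rough sketch of the gating follows — assumed to mirror fs/cachefiles/internal.h, whose exact definitions differ; the KENTER bit value is assumed by analogy with the KLEAVE/KDEBUG bits visible in the internal.h hunk above.

	#include <linux/kernel.h>

	extern unsigned cachefiles_debug;	/* set via module_param() */
	#define CACHEFILES_DEBUG_KENTER	1	/* assumed; KLEAVE=2, KDEBUG=4 above */

	/* only prints when the corresponding debug bit is enabled */
	#define _enter(FMT, ...)						\
	do {									\
		if (unlikely(cachefiles_debug & CACHEFILES_DEBUG_KENTER))	\
			printk(KERN_DEBUG "==> %s(" FMT ")\n",			\
			       __func__, ##__VA_ARGS__);			\
	} while (0)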
Signed-off-by: David Howells --- fs/cachefiles/rdwr.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 9108b8ea505a..bf123d9c3206 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -77,25 +77,25 @@ static int cachefiles_read_reissue(struct cachefiles_object *object, struct page *backpage = monitor->back_page, *backpage2; int ret; - kenter("{ino=%lx},{%lx,%lx}", + _enter("{ino=%lx},{%lx,%lx}", object->backer->d_inode->i_ino, backpage->index, backpage->flags); /* skip if the page was truncated away completely */ if (backpage->mapping != bmapping) { - kleave(" = -ENODATA [mapping]"); + _leave(" = -ENODATA [mapping]"); return -ENODATA; } backpage2 = find_get_page(bmapping, backpage->index); if (!backpage2) { - kleave(" = -ENODATA [gone]"); + _leave(" = -ENODATA [gone]"); return -ENODATA; } if (backpage != backpage2) { put_page(backpage2); - kleave(" = -ENODATA [different]"); + _leave(" = -ENODATA [different]"); return -ENODATA; } @@ -114,7 +114,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object, if (PageUptodate(backpage)) goto unlock_discard; - kdebug("reissue read"); + _debug("reissue read"); ret = bmapping->a_ops->readpage(NULL, backpage); if (ret < 0) goto unlock_discard; @@ -129,7 +129,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object, } /* it'll reappear on the todo list */ - kleave(" = -EINPROGRESS"); + _leave(" = -EINPROGRESS"); return -EINPROGRESS; unlock_discard: @@ -137,7 +137,7 @@ unlock_discard: spin_lock_irq(&object->work_lock); list_del(&monitor->op_link); spin_unlock_irq(&object->work_lock); - kleave(" = %d", ret); + _leave(" = %d", ret); return ret; } -- cgit v1.2.1 From ef46ed888efb1e8da33be5d33c9b54476289a43b Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Dec 2012 21:52:35 +0000 Subject: FS-Cache: Make cookie relinquishment wait for outstanding reads Make fscache_relinquish_cookie() log a warning and wait if there are any outstanding reads left on the cookie it was given. Signed-off-by: David Howells --- fs/fscache/cookie.c | 18 ++++++++++++++---- fs/fscache/operation.c | 10 ++++++++-- 2 files changed, 22 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 0666996adf80..66be9eccede0 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -442,22 +442,32 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) event = retire ? 
FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE; +try_again: spin_lock(&cookie->lock); /* break links with all the active objects */ while (!hlist_empty(&cookie->backing_objects)) { + int n_reads; object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); _debug("RELEASE OBJ%x", object->debug_id); - if (atomic_read(&object->n_reads)) { + set_bit(FSCACHE_COOKIE_WAITING_ON_READS, &cookie->flags); + n_reads = atomic_read(&object->n_reads); + if (n_reads) { + int n_ops = object->n_ops; + int n_in_progress = object->n_in_progress; spin_unlock(&cookie->lock); printk(KERN_ERR "FS-Cache:" - " Cookie '%s' still has %d outstanding reads\n", - cookie->def->name, atomic_read(&object->n_reads)); - BUG(); + " Cookie '%s' still has %d outstanding reads (%d,%d)\n", + cookie->def->name, + n_reads, n_ops, n_in_progress); + wait_on_bit(&cookie->flags, FSCACHE_COOKIE_WAITING_ON_READS, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + printk("Wait finished\n"); + goto try_again; } /* detach each cache object from the object cookie */ diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 30afdfa7aec7..c857ab824d6e 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -340,8 +340,14 @@ void fscache_put_operation(struct fscache_operation *op) object = op->object; - if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) - atomic_dec(&object->n_reads); + if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) { + if (atomic_dec_and_test(&object->n_reads)) { + clear_bit(FSCACHE_COOKIE_WAITING_ON_READS, + &object->cookie->flags); + wake_up_bit(&object->cookie->flags, + FSCACHE_COOKIE_WAITING_ON_READS); + } + } /* now... we may get called with the object spinlock held, so we * complete the cleanup here only if we can immediately acquire the -- cgit v1.2.1 From 9f10523f891928330b7529da54c1a3cc65180b1a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Dec 2012 21:52:35 +0000 Subject: FS-Cache: Fix operation state management and accounting Fix the state management of internal fscache operations and the accounting of what operations are in what states. This is done by: (1) Give struct fscache_operation an enum variable that directly represents the state it's currently in, rather than spreading this knowledge over a bunch of flags, over who is processing the operation at the moment and over whether it is queued or not. This makes it easier to write assertions to check the state at various points and to prevent invalid state transitions. (2) Add an 'operation complete' state and supply a function to indicate the completion of an operation (fscache_op_complete()) and make things call it. The final call to fscache_put_operation() can then check that an op is in the appropriate state (complete or cancelled). (3) Adjust the use of object->n_ops, ->n_in_progress, ->n_exclusive to better govern the state of an object: (a) The ->n_ops is now the number of extant operations on the object and is now decremented by fscache_put_operation() only. (b) The ->n_in_progress is simply the number of operations that have been taken off the object's pending queue for the purposes of being run. This is decremented by fscache_op_complete() only. (c) The ->n_exclusive is the number of exclusive ops that have been submitted and queued or are in progress. It is decremented by fscache_op_complete() and by fscache_cancel_op(). fscache_put_operation() and fscache_operation_gc() now no longer try to clean up ->n_exclusive and ->n_in_progress. That was leading to double decrements against fscache_cancel_op().
fscache_cancel_op() now no longer decrements ->n_ops. That was leading to double decrements against fscache_put_operation(). fscache_submit_exclusive_op() now decides whether it has to queue an op based on ->n_in_progress being > 0 rather than ->n_ops > 0 as the latter will persist in being true even after all preceding operations have been cancelled or completed. Furthermore, if an object is active and there are runnable ops against it, there must be at least one op running. (4) Add a remaining-pages counter (n_pages) to struct fscache_retrieval and provide a function to record completion of the pages as they complete. When n_pages reaches 0, the operation is deemed to be complete and fscache_op_complete() is called. Add calls to fscache_retrieval_complete() anywhere we've finished with a page we've been given to read or allocate for. This includes places where we just return pages to the netfs for reading from the server and where accessing the cache fails and we discard the proposed netfs page. The bugs in the unfixed state management manifest themselves as oopses like the following where the operation completion gets out of sync with return of the cookie by the netfs. This is possible because the cache unlocks and returns all the netfs pages before recording its completion - which means that there's nothing to stop the netfs discarding them and returning the cookie. FS-Cache: Cookie 'NFS.fh' still has outstanding reads ------------[ cut here ]------------ kernel BUG at fs/fscache/cookie.c:519! invalid opcode: 0000 [#1] SMP CPU 1 Modules linked in: cachefiles nfs fscache auth_rpcgss nfs_acl lockd sunrpc Pid: 400, comm: kswapd0 Not tainted 3.1.0-rc7-fsdevel+ #1090 /DG965RY RIP: 0010:[] [] __fscache_relinquish_cookie+0x170/0x343 [fscache] RSP: 0018:ffff8800368cfb00 EFLAGS: 00010282 RAX: 000000000000003c RBX: ffff880023cc8790 RCX: 0000000000000000 RDX: 0000000000002f2e RSI: 0000000000000001 RDI: ffffffff813ab86c RBP: ffff8800368cfb50 R08: 0000000000000002 R09: 0000000000000000 R10: ffff88003a1b7890 R11: ffff88001df6e488 R12: ffff880023d8ed98 R13: ffff880023cc8798 R14: 0000000000000004 R15: ffff88003b8bf370 FS: 0000000000000000(0000) GS:ffff88003bd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00000000008ba008 CR3: 0000000023d93000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process kswapd0 (pid: 400, threadinfo ffff8800368ce000, task ffff88003b8bf040) Stack: ffff88003b8bf040 ffff88001df6e528 ffff88001df6e528 ffffffffa00b46b0 ffff88003b8bf040 ffff88001df6e488 ffff88001df6e620 ffffffffa00b46b0 ffff88001ebd04c8 0000000000000004 ffff8800368cfb70 ffffffffa00b2c91 Call Trace: [] nfs_fscache_release_inode_cookie+0x3b/0x47 [nfs] [] nfs_clear_inode+0x3c/0x41 [nfs] [] nfs4_evict_inode+0x2f/0x33 [nfs] [] evict+0xa1/0x15c [] dispose_list+0x2c/0x38 [] prune_icache_sb+0x28c/0x29b [] prune_super+0xd5/0x140 [] shrink_slab+0x102/0x1ab [] balance_pgdat+0x2f2/0x595 [] ? process_timeout+0xb/0xb [] kswapd+0x270/0x289 [] ? __init_waitqueue_head+0x46/0x46 [] ? balance_pgdat+0x595/0x595 [] kthread+0x7f/0x87 [] kernel_thread_helper+0x4/0x10 [] ? finish_task_switch+0x45/0xc0 [] ? retint_restore_args+0xe/0xe [] ? __init_kthread_worker+0x53/0x53 [] ? 
gs_change+0xb/0xb Signed-off-by: David Howells --- fs/cachefiles/rdwr.c | 31 ++++++++++++++--- fs/fscache/object.c | 2 -- fs/fscache/operation.c | 91 ++++++++++++++++++++++++++++++++------------------ fs/fscache/page.c | 25 +++++++++++--- 4 files changed, 106 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index bf123d9c3206..93a0815e0498 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -197,6 +197,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op) fscache_end_io(op, monitor->netfs_page, error); page_cache_release(monitor->netfs_page); + fscache_retrieval_complete(op, 1); fscache_put_retrieval(op); kfree(monitor); @@ -339,6 +340,7 @@ backing_page_already_uptodate: copy_highpage(netpage, backpage); fscache_end_io(op, netpage, 0); + fscache_retrieval_complete(op, 1); success: _debug("success"); @@ -360,6 +362,7 @@ read_error: goto out; io_error: cachefiles_io_error_obj(object, "Page read error on backing file"); + fscache_retrieval_complete(op, 1); ret = -ENOBUFS; goto out; @@ -369,6 +372,7 @@ nomem_monitor: fscache_put_retrieval(monitor->op); kfree(monitor); nomem: + fscache_retrieval_complete(op, 1); _leave(" = -ENOMEM"); return -ENOMEM; } @@ -407,7 +411,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, _enter("{%p},{%lx},,,", object, page->index); if (!object->backer) - return -ENOBUFS; + goto enobufs; inode = object->backer->d_inode; ASSERT(S_ISREG(inode->i_mode)); @@ -416,7 +420,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, /* calculate the shift required to use bmap */ if (inode->i_sb->s_blocksize > PAGE_SIZE) - return -ENOBUFS; + goto enobufs; shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; @@ -448,13 +452,19 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, } else if (cachefiles_has_space(cache, 0, 1) == 0) { /* there's space in the cache we can use */ fscache_mark_page_cached(op, page); + fscache_retrieval_complete(op, 1); ret = -ENODATA; } else { - ret = -ENOBUFS; + goto enobufs; } _leave(" = %d", ret); return ret; + +enobufs: + fscache_retrieval_complete(op, 1); + _leave(" = -ENOBUFS"); + return -ENOBUFS; } /* @@ -632,6 +642,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, /* the netpage is unlocked and marked up to date here */ fscache_end_io(op, netpage, 0); + fscache_retrieval_complete(op, 1); page_cache_release(netpage); netpage = NULL; continue; @@ -659,6 +670,7 @@ out: list_for_each_entry_safe(netpage, _n, list, lru) { list_del(&netpage->lru); page_cache_release(netpage); + fscache_retrieval_complete(op, 1); } _leave(" = %d", ret); @@ -707,7 +719,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, *nr_pages); if (!object->backer) - return -ENOBUFS; + goto all_enobufs; space = 1; if (cachefiles_has_space(cache, 0, *nr_pages) < 0) @@ -720,7 +732,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, /* calculate the shift required to use bmap */ if (inode->i_sb->s_blocksize > PAGE_SIZE) - return -ENOBUFS; + goto all_enobufs; shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; @@ -760,7 +772,10 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, nrbackpages++; } else if (space && pagevec_add(&pagevec, page) == 0) { fscache_mark_pages_cached(op, &pagevec); + fscache_retrieval_complete(op, 1); ret = -ENODATA; + } else { + fscache_retrieval_complete(op, 1); } } @@ -781,6 +796,10 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, 
_leave(" = %d [nr=%u%s]", ret, *nr_pages, list_empty(pages) ? " empty" : ""); return ret; + +all_enobufs: + fscache_retrieval_complete(op, *nr_pages); + return -ENOBUFS; } /* @@ -815,6 +834,7 @@ int cachefiles_allocate_page(struct fscache_retrieval *op, else ret = -ENOBUFS; + fscache_retrieval_complete(op, 1); _leave(" = %d", ret); return ret; } @@ -864,6 +884,7 @@ int cachefiles_allocate_pages(struct fscache_retrieval *op, ret = -ENOBUFS; } + fscache_retrieval_complete(op, *nr_pages); _leave(" = %d", ret); return ret; } diff --git a/fs/fscache/object.c b/fs/fscache/object.c index b6b897c550ac..773bc798a416 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -587,8 +587,6 @@ static void fscache_object_available(struct fscache_object *object) if (object->n_in_progress == 0) { if (object->n_ops > 0) { ASSERTCMP(object->n_ops, >=, object->n_obj_ops); - ASSERTIF(object->n_ops > object->n_obj_ops, - !list_empty(&object->pending_ops)); fscache_start_operations(object); } else { ASSERT(list_empty(&object->pending_ops)); diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index c857ab824d6e..748f9553c2cb 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -37,6 +37,7 @@ void fscache_enqueue_operation(struct fscache_operation *op) ASSERT(op->processor != NULL); ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE); ASSERTCMP(atomic_read(&op->usage), >, 0); + ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS); fscache_stat(&fscache_n_op_enqueue); switch (op->flags & FSCACHE_OP_TYPE) { @@ -64,6 +65,9 @@ EXPORT_SYMBOL(fscache_enqueue_operation); static void fscache_run_op(struct fscache_object *object, struct fscache_operation *op) { + ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING); + + op->state = FSCACHE_OP_ST_IN_PROGRESS; object->n_in_progress++; if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) wake_up_bit(&op->flags, FSCACHE_OP_WAITING); @@ -80,22 +84,23 @@ static void fscache_run_op(struct fscache_object *object, int fscache_submit_exclusive_op(struct fscache_object *object, struct fscache_operation *op) { - int ret; - _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id); + ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED); + ASSERTCMP(atomic_read(&op->usage), >, 0); + spin_lock(&object->lock); ASSERTCMP(object->n_ops, >=, object->n_in_progress); ASSERTCMP(object->n_ops, >=, object->n_exclusive); ASSERT(list_empty(&op->pend_link)); - ret = -ENOBUFS; + op->state = FSCACHE_OP_ST_PENDING; if (fscache_object_is_active(object)) { op->object = object; object->n_ops++; object->n_exclusive++; /* reads and writes must wait */ - if (object->n_ops > 1) { + if (object->n_in_progress > 0) { atomic_inc(&op->usage); list_add_tail(&op->pend_link, &object->pending_ops); fscache_stat(&fscache_n_op_pend); @@ -111,7 +116,6 @@ int fscache_submit_exclusive_op(struct fscache_object *object, /* need to issue a new write op after this */ clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); - ret = 0; } else if (object->state == FSCACHE_OBJECT_CREATING) { op->object = object; object->n_ops++; @@ -119,14 +123,13 @@ int fscache_submit_exclusive_op(struct fscache_object *object, atomic_inc(&op->usage); list_add_tail(&op->pend_link, &object->pending_ops); fscache_stat(&fscache_n_op_pend); - ret = 0; } else { /* not allowed to submit ops in any other state */ BUG(); } spin_unlock(&object->lock); - return ret; + return 0; } /* @@ -186,6 +189,7 @@ int fscache_submit_op(struct fscache_object *object, _enter("{OBJ%x OP%x},{%u}", object->debug_id, op->debug_id, 
atomic_read(&op->usage)); + ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED); ASSERTCMP(atomic_read(&op->usage), >, 0); spin_lock(&object->lock); @@ -196,6 +200,7 @@ int fscache_submit_op(struct fscache_object *object, ostate = object->state; smp_rmb(); + op->state = FSCACHE_OP_ST_PENDING; if (fscache_object_is_active(object)) { op->object = object; object->n_ops++; @@ -225,12 +230,15 @@ int fscache_submit_op(struct fscache_object *object, object->state == FSCACHE_OBJECT_LC_DYING || object->state == FSCACHE_OBJECT_WITHDRAWING) { fscache_stat(&fscache_n_op_rejected); + op->state = FSCACHE_OP_ST_CANCELLED; ret = -ENOBUFS; } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) { fscache_report_unexpected_submission(object, op, ostate); ASSERT(!fscache_object_is_active(object)); + op->state = FSCACHE_OP_ST_CANCELLED; ret = -ENOBUFS; } else { + op->state = FSCACHE_OP_ST_CANCELLED; ret = -ENOBUFS; } @@ -290,13 +298,18 @@ int fscache_cancel_op(struct fscache_operation *op) _enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id); + ASSERTCMP(op->state, >=, FSCACHE_OP_ST_PENDING); + ASSERTCMP(op->state, !=, FSCACHE_OP_ST_CANCELLED); + ASSERTCMP(atomic_read(&op->usage), >, 0); + spin_lock(&object->lock); ret = -EBUSY; - if (!list_empty(&op->pend_link)) { + if (op->state == FSCACHE_OP_ST_PENDING) { + ASSERT(!list_empty(&op->pend_link)); fscache_stat(&fscache_n_op_cancelled); list_del_init(&op->pend_link); - object->n_ops--; + op->state = FSCACHE_OP_ST_CANCELLED; if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) object->n_exclusive--; if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) @@ -310,6 +323,37 @@ int fscache_cancel_op(struct fscache_operation *op) return ret; } +/* + * Record the completion of an in-progress operation. + */ +void fscache_op_complete(struct fscache_operation *op) +{ + struct fscache_object *object = op->object; + + _enter("OBJ%x", object->debug_id); + + ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS); + ASSERTCMP(object->n_in_progress, >, 0); + ASSERTIFCMP(test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags), + object->n_exclusive, >, 0); + ASSERTIFCMP(test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags), + object->n_in_progress, ==, 1); + + spin_lock(&object->lock); + + op->state = FSCACHE_OP_ST_COMPLETE; + + if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) + object->n_exclusive--; + object->n_in_progress--; + if (object->n_in_progress == 0) + fscache_start_operations(object); + + spin_unlock(&object->lock); + _leave(""); +} +EXPORT_SYMBOL(fscache_op_complete); + /* * release an operation * - queues pending ops if this is the last in-progress op @@ -328,8 +372,9 @@ void fscache_put_operation(struct fscache_operation *op) return; _debug("PUT OP"); - if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags)) - BUG(); + ASSERTIFCMP(op->state != FSCACHE_OP_ST_COMPLETE, + op->state, ==, FSCACHE_OP_ST_CANCELLED); + op->state = FSCACHE_OP_ST_DEAD; fscache_stat(&fscache_n_op_release); @@ -365,16 +410,6 @@ void fscache_put_operation(struct fscache_operation *op) return; } - if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) { - ASSERTCMP(object->n_exclusive, >, 0); - object->n_exclusive--; - } - - ASSERTCMP(object->n_in_progress, >, 0); - object->n_in_progress--; - if (object->n_in_progress == 0) - fscache_start_operations(object); - ASSERTCMP(object->n_ops, >, 0); object->n_ops--; if (object->n_ops == 0) @@ -413,23 +448,14 @@ void fscache_operation_gc(struct work_struct *work) spin_unlock(&cache->op_gc_list_lock); object = op->object; + spin_lock(&object->lock); _debug("GC DEFERRED REL OBJ%x 
OP%x", object->debug_id, op->debug_id); fscache_stat(&fscache_n_op_gc); ASSERTCMP(atomic_read(&op->usage), ==, 0); - - spin_lock(&object->lock); - if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) { - ASSERTCMP(object->n_exclusive, >, 0); - object->n_exclusive--; - } - - ASSERTCMP(object->n_in_progress, >, 0); - object->n_in_progress--; - if (object->n_in_progress == 0) - fscache_start_operations(object); + ASSERTCMP(op->state, ==, FSCACHE_OP_ST_DEAD); ASSERTCMP(object->n_ops, >, 0); object->n_ops--; @@ -437,6 +463,7 @@ void fscache_operation_gc(struct work_struct *work) fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED); spin_unlock(&object->lock); + kfree(op); } while (count++ < 20); diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 248a12e22532..b38b13d2a555 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -162,6 +162,7 @@ static void fscache_attr_changed_op(struct fscache_operation *op) fscache_abort_object(object); } + fscache_op_complete(op); _leave(""); } @@ -223,6 +224,8 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op) _enter("{OP%x}", op->op.debug_id); + ASSERTCMP(op->n_pages, ==, 0); + fscache_hist(fscache_retrieval_histogram, op->start_time); if (op->context) fscache_put_context(op->op.object->cookie, op->context); @@ -320,6 +323,11 @@ static int fscache_wait_for_retrieval_activation(struct fscache_object *object, _debug("<<< GO"); check_if_dead: + if (op->op.state == FSCACHE_OP_ST_CANCELLED) { + fscache_stat(stat_object_dead); + _leave(" = -ENOBUFS [cancelled]"); + return -ENOBUFS; + } if (unlikely(fscache_object_is_dead(object))) { fscache_stat(stat_object_dead); return -ENOBUFS; @@ -364,6 +372,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, _leave(" = -ENOMEM"); return -ENOMEM; } + op->n_pages = 1; spin_lock(&cookie->lock); @@ -375,10 +384,10 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP); atomic_inc(&object->n_reads); - set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); + __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); if (fscache_submit_op(object, &op->op) < 0) - goto nobufs_unlock; + goto nobufs_unlock_dec; spin_unlock(&cookie->lock); fscache_stat(&fscache_n_retrieval_ops); @@ -425,6 +434,8 @@ error: _leave(" = %d", ret); return ret; +nobufs_unlock_dec: + atomic_dec(&object->n_reads); nobufs_unlock: spin_unlock(&cookie->lock); kfree(op); @@ -482,6 +493,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, op = fscache_alloc_retrieval(mapping, end_io_func, context); if (!op) return -ENOMEM; + op->n_pages = *nr_pages; spin_lock(&cookie->lock); @@ -491,10 +503,10 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, struct fscache_object, cookie_link); atomic_inc(&object->n_reads); - set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); + __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); if (fscache_submit_op(object, &op->op) < 0) - goto nobufs_unlock; + goto nobufs_unlock_dec; spin_unlock(&cookie->lock); fscache_stat(&fscache_n_retrieval_ops); @@ -541,6 +553,8 @@ error: _leave(" = %d", ret); return ret; +nobufs_unlock_dec: + atomic_dec(&object->n_reads); nobufs_unlock: spin_unlock(&cookie->lock); kfree(op); @@ -583,6 +597,7 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, op = fscache_alloc_retrieval(page->mapping, NULL, NULL); if (!op) return -ENOMEM; + op->n_pages = 1; spin_lock(&cookie->lock); @@ -696,6 +711,7 @@ static void fscache_write_op(struct fscache_operation *_op) 
fscache_end_page_write(object, page); if (ret < 0) { fscache_abort_object(object); + fscache_op_complete(&op->op); } else { fscache_enqueue_operation(&op->op); } @@ -710,6 +726,7 @@ superseded: spin_unlock(&cookie->stores_lock); clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); spin_unlock(&object->lock); + fscache_op_complete(&op->op); _leave(""); } -- cgit v1.2.1 From ef778e7ae67cd426c30cad43378b908f5eb0bad5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Dec 2012 21:52:36 +0000 Subject: FS-Cache: Provide proper invalidation Provide a proper invalidation method rather than relying on the netfs retiring the cookie it has and getting a new one. The problem with this is that it isn't easy for the netfs to make sure that it has completed/cancelled all its outstanding storage and retrieval operations on the cookie it is retiring. Instead, have the cache provide an invalidation method that will cancel or wait for all currently outstanding operations before invalidating the cache, and will cause new operations to queue up behind that. Whilst invalidation is in progress, some requests will be rejected until the cache can stack a barrier on the operation queue to cause new operations to be deferred behind it. Signed-off-by: David Howells --- fs/fscache/cookie.c | 60 +++++++++++++++++++++++++++++++++++++++++ fs/fscache/internal.h | 10 +++++++ fs/fscache/object.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/fscache/operation.c | 32 ++++++++++++++++++++++ fs/fscache/page.c | 51 +++++++++++++++++++++++++++++++++++ fs/fscache/stats.c | 11 +++++++- 6 files changed, 235 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 66be9eccede0..8dcb114758e3 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -369,6 +369,66 @@ cant_attach_object: return ret; } +/* + * Invalidate an object. Callable with spinlocks held. + */ +void __fscache_invalidate(struct fscache_cookie *cookie) +{ + struct fscache_object *object; + + _enter("{%s}", cookie->def->name); + + fscache_stat(&fscache_n_invalidates); + + /* Only permit invalidation of data files. Invalidating an index will + * require the caller to release all its attachments to the tree rooted + * there, and if it's doing that, it may as well just retire the + * cookie. + */ + ASSERTCMP(cookie->def->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE); + + /* We will be updating the cookie too. */ + BUG_ON(!cookie->def->get_aux); + + /* If there's an object, we tell the object state machine to handle the + * invalidation on our behalf, otherwise there's nothing to do. + */ + if (!hlist_empty(&cookie->backing_objects)) { + spin_lock(&cookie->lock); + + if (!hlist_empty(&cookie->backing_objects) && + !test_and_set_bit(FSCACHE_COOKIE_INVALIDATING, + &cookie->flags)) { + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, + cookie_link); + if (object->state < FSCACHE_OBJECT_DYING) + fscache_raise_event( + object, FSCACHE_OBJECT_EV_INVALIDATE); + } + + spin_unlock(&cookie->lock); + } + + _leave(""); +} +EXPORT_SYMBOL(__fscache_invalidate); + +/* + * Wait for object invalidation to complete.
+ */ +void __fscache_wait_on_invalidate(struct fscache_cookie *cookie) +{ + _enter("%p", cookie); + + wait_on_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING, + fscache_wait_bit_interruptible, + TASK_UNINTERRUPTIBLE); + + _leave(""); +} +EXPORT_SYMBOL(__fscache_wait_on_invalidate); + /* * update the index entries backing a cookie */ diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index f6aad48d38a8..c81179303930 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -122,10 +122,16 @@ extern int fscache_submit_exclusive_op(struct fscache_object *, extern int fscache_submit_op(struct fscache_object *, struct fscache_operation *); extern int fscache_cancel_op(struct fscache_operation *); +extern void fscache_cancel_all_ops(struct fscache_object *); extern void fscache_abort_object(struct fscache_object *); extern void fscache_start_operations(struct fscache_object *); extern void fscache_operation_gc(struct work_struct *); +/* + * page.c + */ +extern void fscache_invalidate_writes(struct fscache_cookie *); + /* * proc.c */ @@ -205,6 +211,9 @@ extern atomic_t fscache_n_acquires_ok; extern atomic_t fscache_n_acquires_nobufs; extern atomic_t fscache_n_acquires_oom; +extern atomic_t fscache_n_invalidates; +extern atomic_t fscache_n_invalidates_run; + extern atomic_t fscache_n_updates; extern atomic_t fscache_n_updates_null; extern atomic_t fscache_n_updates_run; @@ -237,6 +246,7 @@ extern atomic_t fscache_n_cop_alloc_object; extern atomic_t fscache_n_cop_lookup_object; extern atomic_t fscache_n_cop_lookup_complete; extern atomic_t fscache_n_cop_grab_object; +extern atomic_t fscache_n_cop_invalidate_object; extern atomic_t fscache_n_cop_update_object; extern atomic_t fscache_n_cop_drop_object; extern atomic_t fscache_n_cop_put_object; diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 773bc798a416..80b549141ea6 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -14,6 +14,7 @@ #define FSCACHE_DEBUG_LEVEL COOKIE #include +#include #include "internal.h" const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = { @@ -22,6 +23,7 @@ const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = { [FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING", [FSCACHE_OBJECT_AVAILABLE] = "OBJECT_AVAILABLE", [FSCACHE_OBJECT_ACTIVE] = "OBJECT_ACTIVE", + [FSCACHE_OBJECT_INVALIDATING] = "OBJECT_INVALIDATING", [FSCACHE_OBJECT_UPDATING] = "OBJECT_UPDATING", [FSCACHE_OBJECT_DYING] = "OBJECT_DYING", [FSCACHE_OBJECT_LC_DYING] = "OBJECT_LC_DYING", @@ -39,6 +41,7 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = { [FSCACHE_OBJECT_CREATING] = "CRTN", [FSCACHE_OBJECT_AVAILABLE] = "AVBL", [FSCACHE_OBJECT_ACTIVE] = "ACTV", + [FSCACHE_OBJECT_INVALIDATING] = "INVL", [FSCACHE_OBJECT_UPDATING] = "UPDT", [FSCACHE_OBJECT_DYING] = "DYNG", [FSCACHE_OBJECT_LC_DYING] = "LCDY", @@ -54,6 +57,7 @@ static void fscache_put_object(struct fscache_object *); static void fscache_initialise_object(struct fscache_object *); static void fscache_lookup_object(struct fscache_object *); static void fscache_object_available(struct fscache_object *); +static void fscache_invalidate_object(struct fscache_object *); static void fscache_release_object(struct fscache_object *); static void fscache_withdraw_object(struct fscache_object *); static void fscache_enqueue_dependents(struct fscache_object *); @@ -78,6 +82,15 @@ static inline void fscache_done_parent_op(struct fscache_object *object) spin_unlock(&parent->lock); } +/* + * Notify netfs of invalidation completion. 
+ */ +static inline void fscache_invalidation_complete(struct fscache_cookie *cookie) +{ + if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING); +} + /* * process events that have been sent to an object's state machine * - initiates parent lookup @@ -125,6 +138,16 @@ static void fscache_object_state_machine(struct fscache_object *object) case FSCACHE_OBJECT_ACTIVE: goto active_transit; + /* Invalidate an object on disk */ + case FSCACHE_OBJECT_INVALIDATING: + clear_bit(FSCACHE_OBJECT_EV_INVALIDATE, &object->events); + fscache_stat(&fscache_n_invalidates_run); + fscache_stat(&fscache_n_cop_invalidate_object); + fscache_invalidate_object(object); + fscache_stat_d(&fscache_n_cop_invalidate_object); + fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE); + goto active_transit; + /* update the object metadata on disk */ case FSCACHE_OBJECT_UPDATING: clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events); @@ -275,6 +298,9 @@ active_transit: case FSCACHE_OBJECT_EV_ERROR: new_state = FSCACHE_OBJECT_DYING; goto change_state; + case FSCACHE_OBJECT_EV_INVALIDATE: + new_state = FSCACHE_OBJECT_INVALIDATING; + goto change_state; case FSCACHE_OBJECT_EV_UPDATE: new_state = FSCACHE_OBJECT_UPDATING; goto change_state; @@ -679,6 +705,7 @@ static void fscache_withdraw_object(struct fscache_object *object) if (object->cookie == cookie) { hlist_del_init(&object->cookie_link); object->cookie = NULL; + fscache_invalidation_complete(cookie); detached = true; } spin_unlock(&cookie->lock); @@ -888,3 +915,48 @@ enum fscache_checkaux fscache_check_aux(struct fscache_object *object, return result; } EXPORT_SYMBOL(fscache_check_aux); + +/* + * Asynchronously invalidate an object. + */ +static void fscache_invalidate_object(struct fscache_object *object) +{ + struct fscache_operation *op; + struct fscache_cookie *cookie = object->cookie; + + _enter("{OBJ%x}", object->debug_id); + + /* Reject any new read/write ops and abort any that are pending. */ + fscache_invalidate_writes(cookie); + clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); + fscache_cancel_all_ops(object); + + /* Now we have to wait for in-progress reads and writes */ + op = kzalloc(sizeof(*op), GFP_KERNEL); + if (!op) { + fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR); + _leave(" [ENOMEM]"); + return; + } + + fscache_operation_init(op, object->cache->ops->invalidate_object, NULL); + op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE); + + spin_lock(&cookie->lock); + if (fscache_submit_exclusive_op(object, op) < 0) + BUG(); + spin_unlock(&cookie->lock); + fscache_put_operation(op); + + /* Once we've completed the invalidation, we know there will be no data + * stored in the cache and thus we can reinstate the data-check-skip + * optimisation. + */ + set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + + /* We can allow read and write requests to come in once again. They'll + * queue up behind our exclusive invalidation operation. 
+ */ + fscache_invalidation_complete(cookie); + _leave(""); +} diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 748f9553c2cb..c58dbe613266 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -323,6 +323,38 @@ int fscache_cancel_op(struct fscache_operation *op) return ret; } +/* + * Cancel all pending operations on an object + */ +void fscache_cancel_all_ops(struct fscache_object *object) +{ + struct fscache_operation *op; + + _enter("OBJ%x", object->debug_id); + + spin_lock(&object->lock); + + while (!list_empty(&object->pending_ops)) { + op = list_entry(object->pending_ops.next, + struct fscache_operation, pend_link); + fscache_stat(&fscache_n_op_cancelled); + list_del_init(&op->pend_link); + + ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING); + op->state = FSCACHE_OP_ST_CANCELLED; + + if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) + object->n_exclusive--; + if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) + wake_up_bit(&op->flags, FSCACHE_OP_WAITING); + fscache_put_operation(op); + cond_resched_lock(&object->lock); + } + + spin_unlock(&object->lock); + _leave(""); +} + /* * Record the completion of an in-progress operation. */ diff --git a/fs/fscache/page.c b/fs/fscache/page.c index b38b13d2a555..7bf9d2557052 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -361,6 +361,11 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, if (hlist_empty(&cookie->backing_objects)) goto nobufs; + if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) { + _leave(" = -ENOBUFS [invalidating]"); + return -ENOBUFS; + } + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); ASSERTCMP(page, !=, NULL); @@ -483,6 +488,11 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, if (hlist_empty(&cookie->backing_objects)) goto nobufs; + if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) { + _leave(" = -ENOBUFS [invalidating]"); + return -ENOBUFS; + } + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); ASSERTCMP(*nr_pages, >, 0); ASSERT(!list_empty(pages)); @@ -591,6 +601,11 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); ASSERTCMP(page, !=, NULL); + if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) { + _leave(" = -ENOBUFS [invalidating]"); + return -ENOBUFS; + } + if (fscache_wait_for_deferred_lookup(cookie) < 0) return -ERESTARTSYS; @@ -730,6 +745,37 @@ superseded: _leave(""); } +/* + * Clear the pages pending writing for invalidation + */ +void fscache_invalidate_writes(struct fscache_cookie *cookie) +{ + struct page *page; + void *results[16]; + int n, i; + + _enter(""); + + while (spin_lock(&cookie->stores_lock), + n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, + ARRAY_SIZE(results), + FSCACHE_COOKIE_PENDING_TAG), + n > 0) { + for (i = n - 1; i >= 0; i--) { + page = results[i]; + radix_tree_delete(&cookie->stores, page->index); + } + + spin_unlock(&cookie->stores_lock); + + for (i = n - 1; i >= 0; i--) + page_cache_release(results[i]); + } + + spin_unlock(&cookie->stores_lock); + _leave(""); +} + /* * request a page be stored in the cache * - returns: @@ -776,6 +822,11 @@ int __fscache_write_page(struct fscache_cookie *cookie, fscache_stat(&fscache_n_stores); + if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) { + _leave(" = -ENOBUFS [invalidating]"); + return -ENOBUFS; + } + op = kzalloc(sizeof(*op), GFP_NOIO | __GFP_NOMEMALLOC | __GFP_NORETRY); if (!op) goto nomem; diff --git a/fs/fscache/stats.c 
b/fs/fscache/stats.c index 4765190d537f..51cdaee14109 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -80,6 +80,9 @@ atomic_t fscache_n_acquires_ok; atomic_t fscache_n_acquires_nobufs; atomic_t fscache_n_acquires_oom; +atomic_t fscache_n_invalidates; +atomic_t fscache_n_invalidates_run; + atomic_t fscache_n_updates; atomic_t fscache_n_updates_null; atomic_t fscache_n_updates_run; @@ -112,6 +115,7 @@ atomic_t fscache_n_cop_alloc_object; atomic_t fscache_n_cop_lookup_object; atomic_t fscache_n_cop_lookup_complete; atomic_t fscache_n_cop_grab_object; +atomic_t fscache_n_cop_invalidate_object; atomic_t fscache_n_cop_update_object; atomic_t fscache_n_cop_drop_object; atomic_t fscache_n_cop_put_object; @@ -168,6 +172,10 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_object_created), atomic_read(&fscache_n_object_lookups_timed_out)); + seq_printf(m, "Invals : n=%u run=%u\n", + atomic_read(&fscache_n_invalidates), + atomic_read(&fscache_n_invalidates_run)); + seq_printf(m, "Updates: n=%u nul=%u run=%u\n", atomic_read(&fscache_n_updates), atomic_read(&fscache_n_updates_null), @@ -246,7 +254,8 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_cop_lookup_object), atomic_read(&fscache_n_cop_lookup_complete), atomic_read(&fscache_n_cop_grab_object)); - seq_printf(m, "CacheOp: upo=%d dro=%d pto=%d atc=%d syn=%d\n", + seq_printf(m, "CacheOp: inv=%d upo=%d dro=%d pto=%d atc=%d syn=%d\n", + atomic_read(&fscache_n_cop_invalidate_object), atomic_read(&fscache_n_cop_update_object), atomic_read(&fscache_n_cop_drop_object), atomic_read(&fscache_n_cop_put_object), -- cgit v1.2.1 From a02de9608595c8ef649ef03ae735b0b45e3d4396 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Dec 2012 21:52:36 +0000 Subject: VFS: Make more complete truncate operation available to CacheFiles Make a more complete truncate operation available to CacheFiles (including security checks and suchlike) so that it can use this to clear invalidated cache files. Signed-off-by: David Howells Acked-by: Al Viro --- fs/open.c | 50 +++++++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index 182d8667b7bd..c819bbdab47f 100644 --- a/fs/open.c +++ b/fs/open.c @@ -61,33 +61,22 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, return ret; } -static long do_sys_truncate(const char __user *pathname, loff_t length) +long vfs_truncate(struct path *path, loff_t length) { - struct path path; struct inode *inode; - int error; - - error = -EINVAL; - if (length < 0) /* sorry, but loff_t says... 
*/ - goto out; + return -EISDIR; if (!S_ISREG(inode->i_mode)) - goto dput_and_out; + return -EINVAL; - error = mnt_want_write(path.mnt); + error = mnt_want_write(path->mnt); if (error) - goto dput_and_out; + goto out; error = inode_permission(inode, MAY_WRITE); if (error) @@ -111,19 +100,34 @@ static long do_sys_truncate(const char __user *pathname, loff_t length) error = locks_verify_truncate(inode, NULL, length); if (!error) - error = security_path_truncate(&path); + error = security_path_truncate(path); if (!error) - error = do_truncate(path.dentry, length, 0, NULL); + error = do_truncate(path->dentry, length, 0, NULL); put_write_and_out: put_write_access(inode); mnt_drop_write_and_out: - mnt_drop_write(path.mnt); -dput_and_out: - path_put(&path); + mnt_drop_write(path->mnt); out: return error; } +EXPORT_SYMBOL_GPL(vfs_truncate); + +static long do_sys_truncate(const char __user *pathname, loff_t length) +{ + struct path path; + int error; + + if (length < 0) /* sorry, but loff_t says... */ + return -EINVAL; + + error = user_path(pathname, &path); + if (!error) { + error = vfs_truncate(&path, length); + path_put(&path); + } + return error; +} SYSCALL_DEFINE2(truncate, const char __user *, path, long, length) { -- cgit v1.2.1 From 9dc8d9bfe4415efb61a5e9390706b8a3bffef329 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Dec 2012 21:52:36 +0000 Subject: CacheFiles: Implement invalidation Implement invalidation for CacheFiles. This is in two parts: (1) Provide an invalidation method (which just truncates the backing file). (2) Abort attempts to copy anything read from the backing file whilst invalidation is in progress. Question: CacheFiles uses truncation in a couple of places. It has been using notify_change() rather than sys_truncate() or something similar. This means it bypasses a bunch of checks and suchlike that it possibly should be making (security, file locking, lease breaking, vfsmount write). Should it be using vfs_truncate() as added by a preceding patch or should it use notify_change() and assume that anyone poking around in the cache files on disk gets everything they deserve?
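To show the newly exported interface in use, a hedged sketch — not taken from the patches — of an in-kernel caller performing the wipe-and-resize that the CacheFiles invalidation below applies to its backing file. The helper name and the kern_path()-based lookup are illustrative assumptions; only vfs_truncate() itself comes from the preceding patch.

	#include <linux/namei.h>
	#include <linux/fs.h>

	/* hypothetical helper: empty a file, then reinstate its size */
	static int wipe_backing_file(const char *pathname, loff_t new_size)
	{
		struct path path;
		int error;

		error = kern_path(pathname, LOOKUP_FOLLOW, &path);
		if (error < 0)
			return error;

		error = vfs_truncate(&path, 0);		/* discard old contents */
		if (error == 0)
			error = vfs_truncate(&path, new_size);	/* re-extend */

		path_put(&path);
		return error;
	}

Because vfs_truncate() performs the permission, file-locking, lease and security checks internally, a caller like this gets the full set of checks that the old notify_change()-based approach bypassed.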
Signed-off-by: David Howells --- fs/cachefiles/interface.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++ fs/cachefiles/rdwr.c | 5 ++++- 2 files changed, 53 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 9bff0f878cfd..7a9d574b961c 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -440,6 +440,54 @@ truncate_failed: return ret; } +/* + * Invalidate an object + */ +static void cachefiles_invalidate_object(struct fscache_operation *op) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + const struct cred *saved_cred; + struct path path; + uint64_t ni_size; + int ret; + + object = container_of(op->object, struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + op->object->cookie->def->get_attr(op->object->cookie->netfs_data, + &ni_size); + + _enter("{OBJ%x},[%llu]", + op->object->debug_id, (unsigned long long)ni_size); + + if (object->backer) { + ASSERT(S_ISREG(object->backer->d_inode->i_mode)); + + fscache_set_store_limit(&object->fscache, ni_size); + + path.dentry = object->backer; + path.mnt = cache->mnt; + + cachefiles_begin_secure(cache, &saved_cred); + ret = vfs_truncate(&path, 0); + if (ret == 0) + ret = vfs_truncate(&path, ni_size); + cachefiles_end_secure(cache, saved_cred); + + if (ret != 0) { + fscache_set_store_limit(&object->fscache, 0); + if (ret == -EIO) + cachefiles_io_error_obj(object, + "Invalidate failed"); + } + } + + fscache_op_complete(op); + _leave(""); +} + /* * dissociate a cache from all the pages it was backing */ @@ -455,6 +503,7 @@ const struct fscache_cache_ops cachefiles_cache_ops = { .lookup_complete = cachefiles_lookup_complete, .grab_object = cachefiles_grab_object, .update_object = cachefiles_update_object, + .invalidate_object = cachefiles_invalidate_object, .drop_object = cachefiles_drop_object, .put_object = cachefiles_put_object, .sync_cache = cachefiles_sync_cache, diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 93a0815e0498..2c994885520a 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -174,7 +174,10 @@ static void cachefiles_read_copier(struct fscache_operation *_op) _debug("- copy {%lu}", monitor->back_page->index); recheck: - if (PageUptodate(monitor->back_page)) { + if (test_bit(FSCACHE_COOKIE_INVALIDATING, + &object->fscache.cookie->flags)) { + error = -ESTALE; + } else if (PageUptodate(monitor->back_page)) { copy_highpage(monitor->netfs_page, monitor->back_page); fscache_mark_page_cached(monitor->op, monitor->netfs_page); -- cgit v1.2.1 From de242c0b8b365a9e348bf53143e18e9d8c9cfae8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Dec 2012 21:52:38 +0000 Subject: NFS: Use FS-Cache invalidation Use the new FS-Cache invalidation facility from NFS to deal with foreign changes being detected on the server rather than attempting to retire the old cookie and get a new one. The problem with the old method was that NFS did not wait for all outstanding storage and retrieval ops on the cache to complete. There was no automatic wait between the calls to ->readpages() and calls to invalidate_inode_pages2() as the latter can only wait on locked pages that have been added to the pagecache (which they haven't yet on entry to ->readpages()). This was leading to oopses like the one below when an outstanding read got cut off from its cookie by a premature release. 
BUG: unable to handle kernel NULL pointer dereference at 00000000000000a8 IP: [] __fscache_read_or_alloc_pages+0x1dd/0x315 [fscache] PGD 15889067 PUD 15890067 PMD 0 Oops: 0000 [#1] SMP CPU 0 Modules linked in: cachefiles nfs fscache auth_rpcgss nfs_acl lockd sunrpc Pid: 4544, comm: tar Not tainted 3.1.0-rc4-fsdevel+ #1064 /DG965RY RIP: 0010:[] [] __fscache_read_or_alloc_pages+0x1dd/0x315 [fscache] RSP: 0018:ffff8800158799e8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff8800070d41e0 RCX: ffff8800083dc1b0 RDX: 0000000000000000 RSI: ffff880015879960 RDI: ffff88003e627b90 RBP: ffff880015879a28 R08: 0000000000000002 R09: 0000000000000002 R10: 0000000000000001 R11: ffff880015879950 R12: ffff880015879aa4 R13: 0000000000000000 R14: ffff8800083dc158 R15: ffff880015879be8 FS: 00007f671e9d87c0(0000) GS:ffff88003bc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00000000000000a8 CR3: 000000001587f000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process tar (pid: 4544, threadinfo ffff880015878000, task ffff880015875040) Stack: ffffffffa00b1759 ffff8800070dc158 ffff8800000213da ffff88002a286508 ffff880015879aa4 ffff880015879be8 0000000000000001 ffff88002a2866e8 ffff880015879a88 ffffffffa00b20be 00000000000200da ffff880015875040 Call Trace: [] ? nfs_fscache_wait_bit+0xd/0xd [nfs] [] __nfs_readpages_from_fscache+0x7e/0x13f [nfs] [] ? __alloc_pages_nodemask+0x156/0x662 [] nfs_readpages+0xee/0x187 [nfs] [] __do_page_cache_readahead+0x1be/0x267 [] ? __do_page_cache_readahead+0xa2/0x267 [] ra_submit+0x1c/0x20 [] ondemand_readahead+0x28b/0x29a [] page_cache_sync_readahead+0x38/0x3a [] generic_file_aio_read+0x2ab/0x67e [] nfs_file_read+0xa4/0xc9 [nfs] [] do_sync_read+0xba/0xfa [] ? might_fault+0x4e/0x9e [] ? security_file_permission+0x7b/0x84 [] ? rw_verify_area+0xab/0xc8 [] vfs_read+0xaa/0x13a [] sys_read+0x45/0x6c [] system_call_fastpath+0x16/0x1b Reported-by: Mark Moseley Signed-off-by: David Howells --- fs/nfs/fscache.h | 20 +++++++++++++++++++- fs/nfs/inode.c | 20 ++++++++++++++++---- fs/nfs/nfs4proc.c | 3 ++- 3 files changed, 37 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index c5b11b53ff33..277b02782897 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -152,6 +152,22 @@ static inline void nfs_readpage_to_fscache(struct inode *inode, __nfs_readpage_to_fscache(inode, page, sync); } +/* + * Invalidate the contents of fscache for this inode. This will not sleep. + */ +static inline void nfs_fscache_invalidate(struct inode *inode) +{ + fscache_invalidate(NFS_I(inode)->fscache); +} + +/* + * Wait for an object to finish being invalidated. 
+ */ +static inline void nfs_fscache_wait_on_invalidate(struct inode *inode) +{ + fscache_wait_on_invalidate(NFS_I(inode)->fscache); +} + /* * indicate the client caching state as readable text */ @@ -162,7 +178,6 @@ static inline const char *nfs_server_fscache_state(struct nfs_server *server) return "no "; } - #else /* CONFIG_NFS_FSCACHE */ static inline int nfs_fscache_register(void) { return 0; } static inline void nfs_fscache_unregister(void) {} @@ -205,6 +220,9 @@ static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx, static inline void nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync) {} + +static inline void nfs_fscache_invalidate(struct inode *inode) {} + static inline const char *nfs_server_fscache_state(struct nfs_server *server) { return "no "; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 2faae14d89f4..ebeb94ce1b0b 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -161,10 +161,12 @@ static void nfs_zap_caches_locked(struct inode *inode) nfsi->attrtimeo_timestamp = jiffies; memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf)); - if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) + if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; - else + nfs_fscache_invalidate(inode); + } else { nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; + } } void nfs_zap_caches(struct inode *inode) @@ -179,6 +181,7 @@ void nfs_zap_mapping(struct inode *inode, struct address_space *mapping) if (mapping->nrpages != 0) { spin_lock(&inode->i_lock); NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; + nfs_fscache_invalidate(inode); spin_unlock(&inode->i_lock); } } @@ -881,7 +884,7 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); spin_unlock(&inode->i_lock); nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); - nfs_fscache_reset_inode_cookie(inode); + nfs_fscache_wait_on_invalidate(inode); dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode)); return 0; @@ -957,6 +960,10 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr i_size_write(inode, nfs_size_to_loff_t(fattr->size)); ret |= NFS_INO_INVALID_ATTR; } + + if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + nfs_fscache_invalidate(inode); + return ret; } @@ -1205,8 +1212,10 @@ static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr struct nfs_inode *nfsi = NFS_I(inode); nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; - if (S_ISDIR(inode->i_mode)) + if (S_ISDIR(inode->i_mode)) { nfsi->cache_validity |= NFS_INO_INVALID_DATA; + nfs_fscache_invalidate(inode); + } if ((fattr->valid & NFS_ATTR_FATTR) == 0) return 0; return nfs_refresh_inode_locked(inode, fattr); @@ -1494,6 +1503,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) (save_cache_validity & NFS_INO_REVAL_FORCED)) nfsi->cache_validity |= invalid; + if (invalid & NFS_INO_INVALID_DATA) + nfs_fscache_invalidate(inode); + return 0; out_err: /* diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 493f0f41c554..5d864fb36578 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -64,7 +64,7 @@ #include "pnfs.h" #include "netns.h" #include "nfs4session.h" - +#include "fscache.h" #define 
NFSDBG_FACILITY NFSDBG_PROC @@ -734,6 +734,7 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) if (!cinfo->atomic || cinfo->before != dir->i_version) nfs_force_lookup_revalidate(dir); dir->i_version = cinfo->after; + nfs_fscache_invalidate(dir); spin_unlock(&dir->i_lock); } -- cgit v1.2.1 From b4cf1e08c8ac95eff65faa53904f7f13ac78194b Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 5 Dec 2012 13:34:45 +0000 Subject: CacheFiles: Add missing retrieval completions CacheFiles is missing some calls to fscache_retrieval_complete() in the error handling/collision paths of its reader functions. This can be seen by the following assertion tripping in fscache_put_operation() whereby the operation being destroyed is still in the in-progress state and has not been cancelled or completed: FS-Cache: Assertion failed 3 == 5 is false ------------[ cut here ]------------ kernel BUG at fs/fscache/operation.c:408! invalid opcode: 0000 [#1] SMP CPU 2 Modules linked in: xfs ioatdma dca loop joydev evdev psmouse dcdbas pcspkr serio_raw i5000_edac edac_core i5k_amb shpchp pci_hotplug sg sr_mod] Pid: 8062, comm: httpd Not tainted 3.1.0-rc8 #1 Dell Inc. PowerEdge 1950/0DT097 RIP: 0010:[] [] fscache_put_operation+0x304/0x330 RSP: 0018:ffff880062f739d8 EFLAGS: 00010296 RAX: 0000000000000025 RBX: ffff8800c5122e84 RCX: ffffffff81ddf040 RDX: 00000000ffffffff RSI: 0000000000000082 RDI: ffffffff81ddef30 RBP: ffff880062f739f8 R08: 0000000000000005 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000003 R12: ffff8800c5122e40 R13: ffff880037a2cd20 R14: ffff880087c7a058 R15: ffff880087c7a000 FS: 00007f63dcf636e0(0000) GS:ffff88022fc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f0c0a91f000 CR3: 0000000062ec2000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process httpd (pid: 8062, threadinfo ffff880062f72000, task ffff880087e58000) Stack: ffff880062f73bf8 0000000000000000 ffff880062f73bf8 ffff880037a2cd20 ffff880062f73a68 ffffffff8119aa7e ffff88006540e000 ffff880062f73ad4 ffff88008e9a4308 ffff880037a2cd20 ffff880062f73a48 ffff8800c5122e40 Call Trace: [] __fscache_read_or_alloc_pages+0x1fe/0x530 [] __nfs_readpages_from_fscache+0x70/0x1c0 [] nfs_readpages+0xca/0x1e0 [] ? rpc_do_put_task+0x36/0x50 [] ? alloc_nfs_open_context+0x4b/0x110 [] ? rpc_call_sync+0x5a/0x70 [] __do_page_cache_readahead+0x1ca/0x270 [] ra_submit+0x21/0x30 [] ondemand_readahead+0x11d/0x250 [] page_cache_sync_readahead+0x36/0x60 [] generic_file_aio_read+0x454/0x770 [] nfs_file_read+0xe1/0x130 [] do_sync_read+0xd9/0x120 [] ? mntput+0x1f/0x40 [] ? 
fput+0x1cb/0x260 [] vfs_read+0xc8/0x180 [] sys_read+0x55/0x90 Reported-by: Mark Moseley Signed-off-by: David Howells --- fs/cachefiles/rdwr.c | 14 ++++++++++---- fs/fscache/page.c | 2 ++ 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 2c994885520a..480992259707 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -361,8 +361,10 @@ out: read_error: _debug("read error %d", ret); - if (ret == -ENOMEM) + if (ret == -ENOMEM) { + fscache_retrieval_complete(op, 1); goto out; + } io_error: cachefiles_io_error_obj(object, "Page read error on backing file"); fscache_retrieval_complete(op, 1); @@ -551,6 +553,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, if (ret < 0) { if (ret == -EEXIST) { page_cache_release(netpage); + fscache_retrieval_complete(op, 1); continue; } goto nomem; @@ -627,6 +630,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, if (ret < 0) { if (ret == -EEXIST) { page_cache_release(netpage); + fscache_retrieval_complete(op, 1); continue; } goto nomem; @@ -645,9 +649,9 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, /* the netpage is unlocked and marked up to date here */ fscache_end_io(op, netpage, 0); - fscache_retrieval_complete(op, 1); page_cache_release(netpage); netpage = NULL; + fscache_retrieval_complete(op, 1); continue; } @@ -682,15 +686,17 @@ out: nomem: _debug("nomem"); ret = -ENOMEM; - goto out; + goto record_page_complete; read_error: _debug("read error %d", ret); if (ret == -ENOMEM) - goto out; + goto record_page_complete; io_error: cachefiles_io_error_obj(object, "Page read error on backing file"); ret = -ENOBUFS; +record_page_complete: + fscache_retrieval_complete(op, 1); goto out; } diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 7bf9d2557052..4dbbca162620 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -329,6 +329,8 @@ check_if_dead: return -ENOBUFS; } if (unlikely(fscache_object_is_dead(object))) { + pr_err("%s() = -ENOBUFS [obj dead %d]", __func__, op->op.state); + fscache_cancel_op(&op->op); fscache_stat(stat_object_dead); return -ENOBUFS; } -- cgit v1.2.1 From 03acc4be5e479eebc95338cd1d72a9954c128e2b Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 5 Dec 2012 13:34:46 +0000 Subject: FS-Cache: Initialise the object event mask with the calculated mask Initialise the object event mask with the calculated mask rather than also unmasking undefined events. Signed-off-by: David Howells --- fs/fscache/object.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 80b549141ea6..2ef8a082a272 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -114,7 +114,8 @@ static void fscache_object_state_machine(struct fscache_object *object) /* wait for the parent object to become ready */ case FSCACHE_OBJECT_INIT: object->event_mask = - ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED); + FSCACHE_OBJECT_EVENTS_MASK & + ~(1 << FSCACHE_OBJECT_EV_CLEARED); fscache_initialise_object(object); goto done; -- cgit v1.2.1 From c2d35bfe4b508451b75b5b6bc60a08dbdc44f952 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 5 Dec 2012 13:34:47 +0000 Subject: FS-Cache: Don't mask off the object event mask when printing it Don't mask off the object event mask when printing it. That way it can be seen if there are bits set that shouldn't be.
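For illustration only (the values are made up, and FSCACHE_OBJECT_EVENTS_MASK is assumed to be 0xff here): with the mask no longer ANDed in at print time, a stray bit above the defined event range shows up in the dump instead of being silently truncated.

	#include <stdio.h>

	#define FSCACHE_OBJECT_EVENTS_MASK 0xffUL	/* assumed width, example only */

	int main(void)
	{
		unsigned long event_mask = 0x1f5;	/* hypothetical corrupted value */

		/* old form: masking at print time hides the stray bit 8 */
		printf("masked:   ev=[%lx]\n", event_mask & FSCACHE_OBJECT_EVENTS_MASK);
		/* new form: printing the raw field makes the stray bit visible */
		printf("unmasked: ev=[%lx]\n", event_mask);
		return 0;
	}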
Signed-off-by: David Howells --- fs/cachefiles/namei.c | 3 +-- fs/fscache/object-list.c | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index b0b5f7cdfffa..8c01c5fcdf75 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -40,8 +40,7 @@ void __cachefiles_printk_object(struct cachefiles_object *object, printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n", prefix, fscache_object_states[object->fscache.state], object->fscache.flags, work_busy(&object->fscache.work), - object->fscache.events, - object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK); + object->fscache.events, object->fscache.event_mask); printk(KERN_ERR "%sops=%u inp=%u exc=%u\n", prefix, object->fscache.n_ops, object->fscache.n_in_progress, object->fscache.n_exclusive); diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index ebe29c581380..f27c89d17885 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -245,7 +245,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v) obj->n_in_progress, obj->n_exclusive, atomic_read(&obj->n_reads), - obj->event_mask & FSCACHE_OBJECT_EVENTS_MASK, + obj->event_mask, obj->events, obj->flags, work_busy(&obj->work)); -- cgit v1.2.1 From 75bc411388f4aeb9fb0381bd56eb5d67193ed9a1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 5 Dec 2012 13:34:48 +0000 Subject: FS-Cache: Limit the number of I/O error reports for a cache Limit the number of I/O error reports for a cache to 1 to prevent massive amounts of noise. After the first I/O error the cache is taken off line automatically, so must be restarted to resume caching. Signed-off-by: David Howells --- fs/fscache/cache.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c index 6a3c48abd677..b52aed1dca97 100644 --- a/fs/fscache/cache.c +++ b/fs/fscache/cache.c @@ -314,10 +314,10 @@ EXPORT_SYMBOL(fscache_add_cache); */ void fscache_io_error(struct fscache_cache *cache) { - set_bit(FSCACHE_IOERROR, &cache->flags); - - printk(KERN_ERR "FS-Cache: Cache %s stopped due to I/O error\n", - cache->ops->name); + if (!test_and_set_bit(FSCACHE_IOERROR, &cache->flags)) + printk(KERN_ERR "FS-Cache:" + " Cache '%s' stopped due to I/O error\n", + cache->ops->name); } EXPORT_SYMBOL(fscache_io_error); -- cgit v1.2.1 From 8d76349d359064859217dc292dc8733e209705af Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 5 Dec 2012 13:34:48 +0000 Subject: FS-Cache: Exclusive op submission can BUG if there's been an I/O error The function to submit an exclusive op (fscache_submit_exclusive_op()) can BUG if there's been an I/O error because it may see the parent cache object in an unexpected state. It should only BUG if there hasn't been an I/O error. In this case the problem was produced by remounting the cache partition to be R/O. The EROFS state was detected and the cache was aborted, but not everything handled the aborting correctly. SysRq : Emergency Remount R/O EXT4-fs (sda6): re-mounted. Opts: (null) Emergency Remount complete CacheFiles: I/O Error: Failed to update xattr with error -30 FS-Cache: Cache cachefiles stopped due to I/O error ------------[ cut here ]------------ kernel BUG at fs/fscache/operation.c:128! 
invalid opcode: 0000 [#1] SMP CPU 0 Modules linked in: cachefiles nfs fscache auth_rpcgss nfs_acl lockd sunrpc Pid: 6612, comm: kworker/u:2 Not tainted 3.1.0-rc8-fsdevel+ #1093 /DG965RY RIP: 0010:[] [] fscache_submit_exclusive_op+0x2ad/0x2c2 [fscache] RSP: 0018:ffff880000853d40 EFLAGS: 00010206 RAX: ffff880038ac72a8 RBX: ffff8800181f2260 RCX: ffffffff81f2b2b0 RDX: 0000000000000001 RSI: ffffffff8179a478 RDI: ffff8800181f2280 RBP: ffff880000853d60 R08: 0000000000000002 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000001 R12: ffff880038ac7268 R13: ffff8800181f2280 R14: ffff88003a359190 R15: 000000010122b162 FS: 0000000000000000(0000) GS:ffff88003bc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00000034cc4a77f0 CR3: 0000000010e96000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process kworker/u:2 (pid: 6612, threadinfo ffff880000852000, task ffff880014c3c040) Stack: ffff8800181f2260 ffff8800181f2310 ffff880038ac7268 ffff8800181f2260 ffff880000853dc0 ffffffffa0072375 ffff880037ecfe00 ffff88003a359198 ffff880000853dc0 0000000000000246 0000000000000000 ffff88000a91d308 Call Trace: [] fscache_object_work_func+0x792/0xe65 [fscache] [] process_one_work+0x1eb/0x37f [] ? process_one_work+0x18d/0x37f [] ? fscache_enqueue_dependents+0xd8/0xd8 [fscache] [] worker_thread+0x15a/0x21a [] ? rescuer_thread+0x188/0x188 [] kthread+0x7f/0x87 [] kernel_thread_helper+0x4/0x10 [] ? finish_task_switch+0x45/0xc0 [] ? retint_restore_args+0xe/0xe [] ? __init_kthread_worker+0x53/0x53 [] ? gs_change+0xb/0xb Signed-off-by: David Howells --- fs/fscache/internal.h | 1 + fs/fscache/object.c | 23 +++++++++++++++++------ fs/fscache/operation.c | 13 ++++++++++--- 3 files changed, 28 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index c81179303930..dcb3e1d5dbf6 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -288,6 +288,7 @@ extern const struct file_operations fscache_stats_fops; static inline void fscache_raise_event(struct fscache_object *object, unsigned event) { + BUG_ON(event >= NR_FSCACHE_OBJECT_EVENTS); if (!test_and_set_bit(event, &object->events) && test_bit(event, &object->event_mask)) fscache_enqueue_object(object); diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 2ef8a082a272..2c512cbac380 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -103,6 +103,7 @@ static void fscache_object_state_machine(struct fscache_object *object) { enum fscache_object_state new_state; struct fscache_cookie *cookie; + int event; ASSERT(object != NULL); @@ -275,7 +276,8 @@ static void fscache_object_state_machine(struct fscache_object *object) /* determine the transition from a lookup state */ lookup_transit: - switch (fls(object->events & object->event_mask) - 1) { + event = fls(object->events & object->event_mask) - 1; + switch (event) { case FSCACHE_OBJECT_EV_WITHDRAW: case FSCACHE_OBJECT_EV_RETIRE: case FSCACHE_OBJECT_EV_RELEASE: @@ -292,7 +294,8 @@ lookup_transit: /* determine the transition from an active state */ active_transit: - switch (fls(object->events & object->event_mask) - 1) { + event = fls(object->events & object->event_mask) - 1; + switch (event) { case FSCACHE_OBJECT_EV_WITHDRAW: case FSCACHE_OBJECT_EV_RETIRE: case FSCACHE_OBJECT_EV_RELEASE: @@ -314,7 +317,8 @@ active_transit: /* determine the transition from a terminal state */ terminal_transit: - switch 
(fls(object->events & object->event_mask) - 1) { + event = fls(object->events & object->event_mask) - 1; + switch (event) { case FSCACHE_OBJECT_EV_WITHDRAW: new_state = FSCACHE_OBJECT_WITHDRAWING; goto change_state; @@ -347,8 +351,8 @@ done: unsupported_event: printk(KERN_ERR "FS-Cache:" - " Unsupported event %lx [mask %lx] in state %s\n", - object->events, object->event_mask, + " Unsupported event %d [%lx/%lx] in state %s\n", + event, object->events, object->event_mask, fscache_object_states[object->state]); BUG(); } @@ -945,7 +949,7 @@ static void fscache_invalidate_object(struct fscache_object *object) spin_lock(&cookie->lock); if (fscache_submit_exclusive_op(object, op) < 0) - BUG(); + goto submit_op_failed; spin_unlock(&cookie->lock); fscache_put_operation(op); @@ -960,4 +964,11 @@ static void fscache_invalidate_object(struct fscache_object *object) */ fscache_invalidation_complete(cookie); _leave(""); + return; + +submit_op_failed: + spin_unlock(&cookie->lock); + kfree(op); + fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR); + _leave(" [EIO]"); } diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index c58dbe613266..9e6b7d232bb1 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -84,6 +84,8 @@ static void fscache_run_op(struct fscache_object *object, int fscache_submit_exclusive_op(struct fscache_object *object, struct fscache_operation *op) { + int ret; + _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id); ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED); @@ -116,6 +118,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object, /* need to issue a new write op after this */ clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); + ret = 0; } else if (object->state == FSCACHE_OBJECT_CREATING) { op->object = object; object->n_ops++; @@ -123,13 +126,17 @@ int fscache_submit_exclusive_op(struct fscache_object *object, atomic_inc(&op->usage); list_add_tail(&op->pend_link, &object->pending_ops); fscache_stat(&fscache_n_op_pend); + ret = 0; } else { - /* not allowed to submit ops in any other state */ - BUG(); + /* If we're in any other state, there must have been an I/O + * error of some nature. + */ + ASSERT(test_bit(FSCACHE_IOERROR, &object->cache->flags)); + ret = -EIO; } spin_unlock(&object->lock); - return 0; + return ret; } /* -- cgit v1.2.1 From 8c209ce721444a61b61d9e772746c721e4d8d1e8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 5 Dec 2012 13:34:49 +0000 Subject: NFS: nfs_migrate_page() does not wait for FS-Cache to finish with a page nfs_migrate_page() does not wait for FS-Cache to finish with a page, probably leading to the following bad-page-state: BUG: Bad page state in process python-bin pfn:17d39b page:ffffea00053649e8 flags:004000000000100c count:0 mapcount:0 mapping:(null) index:38686 (Tainted: G B ---------------- ) Pid: 31053, comm: python-bin Tainted: G B ---------------- 2.6.32-71.24.1.el6.x86_64 #1 Call Trace: [] bad_page+0x107/0x160 [] free_hot_cold_page+0x1c9/0x220 [] __pagevec_free+0x59/0xb0 [] ? flush_tlb_others_ipi+0x128/0x130 [] release_pages+0x21c/0x250 [] ? remove_migration_pte+0x28a/0x2b0 [] ? mem_cgroup_get_reclaim_stat_from_page+0x18/0x70 [] ____pagevec_lru_add+0x167/0x180 [] __lru_cache_add+0x58/0x70 [] lru_cache_add_lru+0x21/0x40 [] putback_lru_page+0x69/0x100 [] migrate_pages+0x13d/0x5d0 [] ? ____pagevec_lru_add+0x167/0x180 [] ? compaction_alloc+0x0/0x370 [] compact_zone+0x4cc/0x600 [] ? get_page_from_freelist+0x15c/0x820 [] ? 
check_preempt_wakeup+0x1c4/0x3c0 [] compact_zone_order+0x7e/0xb0 [] try_to_compact_pages+0x109/0x170 [] __alloc_pages_nodemask+0x5ed/0x850 [] ? thread_return+0x4e/0x778 [] alloc_pages_vma+0x93/0x150 [] do_huge_pmd_anonymous_page+0x135/0x340 [] ? rwsem_down_read_failed+0x26/0x30 [] handle_mm_fault+0x245/0x2b0 [] do_page_fault+0x123/0x3a0 [] page_fault+0x25/0x30 nfs_migrate_page() calls nfs_fscache_release_page() which doesn't actually wait - even if __GFP_WAIT is set. The reason it doesn't wait is that fscache_maybe_release_page() might deadlock the allocator as the work threads writing to the cache may all end up sleeping on memory allocation. However, I wonder if that is actually a problem. There are a number of things I can do to deal with this: (1) Make nfs_migrate_page() wait. (2) Make fscache_maybe_release_page() honour the __GFP_WAIT flag. (3) Set a timeout around the wait. (4) Make nfs_migrate_page() return an error if the page is still busy. For the moment, I'll select (2) and (4). Signed-off-by: David Howells Acked-by: Jeff Layton --- fs/fscache/internal.h | 1 + fs/fscache/page.c | 19 ++++++++++++++----- fs/fscache/stats.c | 6 ++++-- fs/nfs/write.c | 3 ++- 4 files changed, 21 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index dcb3e1d5dbf6..88a48ccb7d9e 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -200,6 +200,7 @@ extern atomic_t fscache_n_store_vmscan_not_storing; extern atomic_t fscache_n_store_vmscan_gone; extern atomic_t fscache_n_store_vmscan_busy; extern atomic_t fscache_n_store_vmscan_cancelled; +extern atomic_t fscache_n_store_vmscan_wait; extern atomic_t fscache_n_marks; extern atomic_t fscache_n_uncaches; diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 4dbbca162620..f9b2fb3ae492 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -56,6 +56,7 @@ bool __fscache_maybe_release_page(struct fscache_cookie *cookie, _enter("%p,%p,%x", cookie, page, gfp); +try_again: rcu_read_lock(); val = radix_tree_lookup(&cookie->stores, page->index); if (!val) { @@ -104,11 +105,19 @@ bool __fscache_maybe_release_page(struct fscache_cookie *cookie, return true; page_busy: - /* we might want to wait here, but that could deadlock the allocator as - * the work threads writing to the cache may all end up sleeping - * on memory allocation */ - fscache_stat(&fscache_n_store_vmscan_busy); - return false; + /* We will wait here if we're allowed to, but that could deadlock the + * allocator as the work threads writing to the cache may all end up + * sleeping on memory allocation, so we may need to impose a timeout + * too.
+ */ + if (!(gfp & __GFP_WAIT)) { + fscache_stat(&fscache_n_store_vmscan_busy); + return false; + } + + fscache_stat(&fscache_n_store_vmscan_wait); + __fscache_wait_on_page_write(cookie, page); + gfp &= ~__GFP_WAIT; + goto try_again; } EXPORT_SYMBOL(__fscache_maybe_release_page); diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index 51cdaee14109..8179e8bc4a3d 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -69,6 +69,7 @@ atomic_t fscache_n_store_vmscan_not_storing; atomic_t fscache_n_store_vmscan_gone; atomic_t fscache_n_store_vmscan_busy; atomic_t fscache_n_store_vmscan_cancelled; +atomic_t fscache_n_store_vmscan_wait; atomic_t fscache_n_marks; atomic_t fscache_n_uncaches; @@ -232,11 +233,12 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_store_radix_deletes), atomic_read(&fscache_n_store_pages_over_limit)); - seq_printf(m, "VmScan : nos=%u gon=%u bsy=%u can=%u\n", + seq_printf(m, "VmScan : nos=%u gon=%u bsy=%u can=%u wt=%u\n", atomic_read(&fscache_n_store_vmscan_not_storing), atomic_read(&fscache_n_store_vmscan_gone), atomic_read(&fscache_n_store_vmscan_busy), - atomic_read(&fscache_n_store_vmscan_cancelled)); + atomic_read(&fscache_n_store_vmscan_cancelled), + atomic_read(&fscache_n_store_vmscan_wait)); seq_printf(m, "Ops : pend=%u run=%u enq=%u can=%u rej=%u\n", atomic_read(&fscache_n_op_pend), diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 5209916e1222..b673be31590e 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1794,7 +1794,8 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage, if (PagePrivate(page)) return -EBUSY; - nfs_fscache_release_page(page, GFP_KERNEL); + if (!nfs_fscache_release_page(page, GFP_KERNEL)) + return -EBUSY; return migrate_page(mapping, newpage, page, mode); } -- cgit v1.2.1 From 969695215f9a865cbf64c4ce3742ac9fc57fffed Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 5 Dec 2012 13:34:49 +0000 Subject: FS-Cache: Add transition to handle invalidate immediately after lookup Add a missing transition to the FS-Cache object state machine to handle an invalidation event occurring between the back end completing the object lookup by calling fscache_obtained_object() (which moves to state OBJECT_AVAILABLE) and the backend returning to fscache_lookup_object() and thence to fscache_object_state_machine() which then does a goto lookup_transit to handle the transition - but lookup_transit doesn't handle EV_INVALIDATE. Without this, the following BUG can be logged: FS-Cache: Unsupported event 2 [5/f7] in state OBJECT_AVAILABLE ------------[ cut here ]------------ kernel BUG at fs/fscache/object.c:357! Where event 2 is EV_INVALIDATE. Signed-off-by: David Howells --- fs/fscache/object.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 2c512cbac380..50d41c180211 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -284,6 +284,9 @@ lookup_transit: case FSCACHE_OBJECT_EV_ERROR: new_state = FSCACHE_OBJECT_LC_DYING; goto change_state; + case FSCACHE_OBJECT_EV_INVALIDATE: + new_state = FSCACHE_OBJECT_INVALIDATING; + goto change_state; case FSCACHE_OBJECT_EV_REQUEUE: goto done; case -1: -- cgit v1.2.1 From a4ff146881c2764d7c3e4ef710e7c27d521ddd51 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 5 Dec 2012 16:31:49 +0000 Subject: NFS4: Open files for fscaching nfs4_file_open() should open files for fscaching.
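The call needs no #ifdef at the call site because fs/nfs/fscache.h follows the same stubbing convention shown earlier for nfs_fscache_invalidate(); a sketch of the assumed stub (not part of this patch) for when CONFIG_NFS_FSCACHE is unset:

	/* Assumed !CONFIG_NFS_FSCACHE stub: the new call in nfs4_file_open()
	 * compiles away to nothing when FS-Cache support is built out.
	 */
	#ifndef CONFIG_NFS_FSCACHE
	static inline void nfs_fscache_set_inode_cookie(struct inode *inode,
							struct file *filp)
	{
	}
	#endif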
Signed-off-by: David Howells --- fs/nfs/fscache.c | 1 + fs/nfs/nfs4file.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index c817787fbdb4..24d1d1c5fcaf 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -307,6 +307,7 @@ void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp) nfs_fscache_inode_unlock(inode); } } +EXPORT_SYMBOL_GPL(nfs_fscache_set_inode_cookie); /* * Replace a per-inode cookie due to revalidation detecting a file having diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index e7699308364a..08ddcccb8887 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -5,6 +5,7 @@ */ #include #include "internal.h" +#include "fscache.h" #include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_FILE @@ -74,6 +75,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); nfs_file_set_open_context(filp, ctx); + nfs_fscache_set_inode_cookie(inode, filp); err = 0; out_put_ctx: -- cgit v1.2.1 From 9c04caa81b876faee5f1cc6eaad76dd7021ab8ff Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 7 Dec 2012 18:08:02 +0000 Subject: FS-Cache: Fix signal handling during waits wait_on_bit() with TASK_INTERRUPTIBLE returns 1 rather than a negative error code, so change what we check for. This means that the signal handling in fscache_wait_for_retrieval_activation() should now work properly. Without this, the following bug can be seen if CTRL-C is pressed during fscache read operation: FS-Cache: Assertion failed 2 == 3 is false ------------[ cut here ]------------ kernel BUG at fs/fscache/page.c:347! invalid opcode: 0000 [#1] SMP Modules linked in: cachefiles(F) nfsv4(F) nfsv3(F) nfsv2(F) nfs(F) fscache(F) auth_rpcgss(F) nfs_acl(F) lockd(F) sunrpc(F) CPU 1 Pid: 15006, comm: slurp-q Tainted: GF 3.7.0-rc8-fsdevel+ #411 /DG965RY RIP: 0010:[] [] fscache_wait_for_retrieval_activation+0x167/0x177 [fscache] RSP: 0018:ffff88002a4c39a8 EFLAGS: 00010292 RAX: 000000000000001a RBX: ffff88002d3dc158 RCX: 0000000000008685 RDX: ffffffff8102ccd6 RSI: 0000000000000001 RDI: ffffffff8102d1d6 RBP: ffff88002a4c39c8 R08: 0000000000000002 R09: 0000000000000000 R10: ffffffff8163afa0 R11: ffff88003bd11900 R12: ffffffffa00868c8 R13: ffff880028306458 R14: ffff88002d3dc1b0 R15: ffff88001372e538 FS: 00007f17426a0700(0000) GS:ffff88003bd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00007f1742494a44 CR3: 0000000031bd7000 CR4: 00000000000007e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process slurp-q (pid: 15006, threadinfo ffff88002a4c2000, task ffff880023de3040) Stack: ffff88002d3dc158 ffff88001372e538 ffff88002a4c3ab4 ffff8800283064e0 ffff88002a4c3a38 ffffffffa0080f6d 0000000000000000 ffff880023de3040 ffff88002a4c3ac8 ffffffff810ac8ae ffff880028306458 ffff88002a4c3bc8 Call Trace: [] __fscache_read_or_alloc_pages+0x24f/0x4bc [fscache] [] ? __alloc_pages_nodemask+0x195/0x75c [] __nfs_readpages_from_fscache+0x86/0x13d [nfs] [] nfs_readpages+0x186/0x1bd [nfs] [] ? alloc_pages_current+0xc7/0xe4 [] ? __page_cache_alloc+0x84/0x91 [] ? __do_page_cache_readahead+0xa6/0x2e0 [] __do_page_cache_readahead+0x237/0x2e0 [] ? __do_page_cache_readahead+0xa6/0x2e0 [] ra_submit+0x1c/0x20 [] ondemand_readahead+0x359/0x382 [] page_cache_sync_readahead+0x38/0x3a [] generic_file_aio_read+0x26b/0x637 [] ? 
nfs_mark_delegation_referenced+0xb/0xb [nfsv4] [] nfs_file_read+0xaa/0xcf [nfs] [] do_sync_read+0x91/0xd1 [] vfs_read+0x9b/0x144 [] sys_read+0x44/0x75 [] system_call_fastpath+0x16/0x1b Signed-off-by: David Howells --- fs/fscache/page.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fscache/page.c b/fs/fscache/page.c index f9b2fb3ae492..5b5d9081c8b2 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -319,7 +319,7 @@ static int fscache_wait_for_retrieval_activation(struct fscache_object *object, fscache_stat(stat_op_waits); if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, fscache_wait_bit_interruptible, - TASK_INTERRUPTIBLE) < 0) { + TASK_INTERRUPTIBLE) != 0) { ret = fscache_cancel_op(&op->op); if (ret == 0) return -ERESTARTSYS; -- cgit v1.2.1 From 7ef001e937e8b9cbedb2fc1c31dd681ac3b31927 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 7 Dec 2012 10:41:26 +0000 Subject: FS-Cache: One of the write operation paths doesn't set the object state In fscache_write_op(), if the object is determined to have become inactive or to have lost its cookie, we don't move the operation state from in-progress, and so an assertion in fscache_put_operation() fails (see below). Instrumenting fscache_op_work_func() indicates that it called fscache_write_op() before calling fscache_put_operation() - where the assertion failed. The assertion at line 433 indicates that the operation state is IN_PROGRESS rather than being COMPLETE or CANCELLED. Instrumenting fscache_write_op() showed that it was being called on an object that had had its cookie removed and that this was due to relinquishment of the cookie by the netfs. At this point fscache no longer has access to the pages of netfs data that were requested to be written, and so simply cancelling the operation is the thing to do. FS-Cache: Assertion failed 3 == 5 is false ------------[ cut here ]------------ kernel BUG at fs/fscache/operation.c:433! invalid opcode: 0000 [#1] SMP Modules linked in: cachefiles(F) nfsv4(F) nfsv3(F) nfsv2(F) nfs(F) fscache(F) auth_rpcgss(F) nfs_acl(F) lockd(F) sunrpc(F) CPU 0 Pid: 1035, comm: kworker/u:3 Tainted: GF 3.7.0-rc8-fsdevel+ #411 /DG965RY RIP: 0010:[] [] fscache_put_operation+0x11a/0x2ed [fscache] RSP: 0018:ffff88003e32bcf8 EFLAGS: 00010296 RAX: 000000000000000f RBX: ffff88001818eb78 RCX: ffffffff6c102000 RDX: ffffffff8102d1ad RSI: ffffffff6c102000 RDI: ffffffff8102d1d6 RBP: ffff88003e32bd18 R08: 0000000000000002 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffffffffa00811da R13: 0000000000000001 R14: 0000000100625d26 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff88003bc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00007fff7dd31c68 CR3: 000000003d730000 CR4: 00000000000007f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process kworker/u:3 (pid: 1035, threadinfo ffff88003e32a000, task ffff88003bb38080) Stack: ffffffff8102d1ad ffff88001818eb78 ffffffffa00811da 0000000000000001 ffff88003e32bd48 ffffffffa007f0ad ffff88001818eb78 ffffffff819583c0 ffff88003df24e00 ffff88003882c3e0 ffff88003e32bde8 ffffffff81042de0 Call Trace: [] ? vprintk_emit+0x3c6/0x41a [] ? __fscache_read_or_alloc_pages+0x4bc/0x4bc [fscache] [] fscache_op_work_func+0xec/0x123 [fscache] [] process_one_work+0x21c/0x3b0 [] ? process_one_work+0x1be/0x3b0 [] ? 
fscache_operation_gc+0x23e/0x23e [fscache] [] worker_thread+0x202/0x2df [] ? rescuer_thread+0x18e/0x18e [] kthread+0xd0/0xd8 [] ? _raw_spin_unlock_irq+0x29/0x3e [] ? __init_kthread_worker+0x55/0x55 [] ret_from_fork+0x7c/0xb0 [] ? __init_kthread_worker+0x55/0x55 Signed-off-by: David Howells --- fs/fscache/page.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 5b5d9081c8b2..ef0218f5080d 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -699,9 +699,27 @@ static void fscache_write_op(struct fscache_operation *_op) spin_lock(&object->lock); cookie = object->cookie; - if (!fscache_object_is_active(object) || !cookie) { + if (!fscache_object_is_active(object)) { + /* If we get here, then the on-disk cache object likely no longer + * exists, so we should just cancel this write operation. + */ spin_unlock(&object->lock); - _leave(""); + op->op.state = FSCACHE_OP_ST_CANCELLED; + _leave(" [inactive]"); + return; + } + + if (!cookie) { + /* If we get here, then the cookie belonging to the object was + * detached, probably by the cookie being withdrawn due to + * memory pressure, which means that the pages we might write + * to the cache from no longer exist - therefore, we can just + * cancel this write operation. + */ + spin_unlock(&object->lock); + op->op.state = FSCACHE_OP_ST_CANCELLED; + _leave(" [cancel] op{f=%lx s=%u} obj{s=%u f=%lx}", + _op->flags, _op->state, object->state, object->flags); return; } -- cgit v1.2.1 From 1f372dff1da37e2b36ae9085368fa46896398598 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 13 Dec 2012 20:03:13 +0000 Subject: FS-Cache: Mark cancellation of in-progress operation Mark as cancelled an operation that is in progress rather than pending at the time it is cancelled, and call fscache_op_complete() to cancel an operation so that blocked ops can be started. Signed-off-by: David Howells --- fs/cachefiles/interface.c | 2 +- fs/fscache/operation.c | 7 ++++--- fs/fscache/page.c | 10 +++++----- 3 files changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 7a9d574b961c..746ce532e130 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -484,7 +484,7 @@ static void cachefiles_invalidate_object(struct fscache_operation *op) } } - fscache_op_complete(op); + fscache_op_complete(op, true); _leave(""); } diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 9e6b7d232bb1..36c59604130d 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -363,9 +363,9 @@ void fscache_cancel_all_ops(struct fscache_object *object) } /* - * Record the completion of an in-progress operation. + * Record the completion or cancellation of an in-progress operation. */ -void fscache_op_complete(struct fscache_operation *op) +void fscache_op_complete(struct fscache_operation *op, bool cancelled) { struct fscache_object *object = op->object; @@ -380,7 +380,8 @@ void fscache_op_complete(struct fscache_operation *op) spin_lock(&object->lock); - op->state = FSCACHE_OP_ST_COMPLETE; + op->state = cancelled ? 
+ FSCACHE_OP_ST_CANCELLED : FSCACHE_OP_ST_COMPLETE; if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) object->n_exclusive--; diff --git a/fs/fscache/page.c b/fs/fscache/page.c index ef0218f5080d..8a92b9fabe83 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -171,7 +171,7 @@ static void fscache_attr_changed_op(struct fscache_operation *op) fscache_abort_object(object); } - fscache_op_complete(op); + fscache_op_complete(op, true); _leave(""); } @@ -704,7 +704,7 @@ static void fscache_write_op(struct fscache_operation *_op) * exists, so we should just cancel this write operation. */ spin_unlock(&object->lock); - op->op.state = FSCACHE_OP_ST_CANCELLED; + fscache_op_complete(&op->op, false); _leave(" [inactive]"); return; } @@ -717,7 +717,7 @@ static void fscache_write_op(struct fscache_operation *_op) * cancel this write operation. */ spin_unlock(&object->lock); - op->op.state = FSCACHE_OP_ST_CANCELLED; + fscache_op_complete(&op->op, false); _leave(" [cancel] op{f=%lx s=%u} obj{s=%u f=%lx}", _op->flags, _op->state, object->state, object->flags); return; @@ -755,7 +755,7 @@ static void fscache_write_op(struct fscache_operation *_op) fscache_end_page_write(object, page); if (ret < 0) { fscache_abort_object(object); - fscache_op_complete(&op->op); + fscache_op_complete(&op->op, true); } else { fscache_enqueue_operation(&op->op); } @@ -770,7 +770,7 @@ superseded: spin_unlock(&cookie->stores_lock); clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); spin_unlock(&object->lock); - fscache_op_complete(&op->op); + fscache_op_complete(&op->op, true); _leave(""); } -- cgit v1.2.1 From 91c7fbbf63f33c77d8d28de624834a21888842bb Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Dec 2012 11:02:22 +0000 Subject: FS-Cache: Clear remaining page count on retrieval cancellation Provide fscache_cancel_op() with a pointer to a function it should invoke under lock if it cancels an operation. Use this to clear the remaining page count upon cancellation of a pending retrieval operation so that fscache_release_retrieval_op() doesn't get an assertion failure (see below). This can happen when a signal occurs, say from CTRL-C being pressed during data retrieval. FS-Cache: Assertion failed 3 == 0 is false ------------[ cut here ]------------ kernel BUG at fs/fscache/page.c:237! 
invalid opcode: 0000 [#641] SMP Modules linked in: cachefiles(F) nfsv4(F) nfsv3(F) nfsv2(F) nfs(F) fscache(F) auth_rpcgss(F) nfs_acl(F) lockd(F) sunrpc(F) CPU 0 Pid: 6075, comm: slurp-q Tainted: GF D 3.7.0-rc8-fsdevel+ #411 /DG965RY RIP: 0010:[] [] fscache_release_retrieval_op+0x75/0xff [fscache] RSP: 0000:ffff88001c6d7988 EFLAGS: 00010296 RAX: 000000000000000f RBX: ffff880014cdfe00 RCX: ffffffff6c102000 RDX: ffffffff8102d1ad RSI: ffffffff6c102000 RDI: ffffffff8102d1d6 RBP: ffff88001c6d7998 R08: 0000000000000002 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 00000000fffffe00 R13: ffff88001c6d7ab4 R14: ffff88001a8638a0 R15: ffff88001552b190 FS: 00007f877aaf0700(0000) GS:ffff88003bc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00007fff11378fd2 CR3: 000000001c6c6000 CR4: 00000000000007f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process slurp-q (pid: 6075, threadinfo ffff88001c6d6000, task ffff88001c6c4080) Stack: ffffffffa007ec07 ffff880014cdfe00 ffff88001c6d79c8 ffffffffa007db4d ffffffffa007ec07 ffff880014cdfe00 00000000fffffe00 ffff88001c6d7ab4 ffff88001c6d7a38 ffffffffa008116d 0000000000000000 ffff88001c6c4080 Call Trace: [] ? fscache_cancel_op+0x194/0x1cf [fscache] [] fscache_put_operation+0x135/0x2ed [fscache] [] ? fscache_cancel_op+0x194/0x1cf [fscache] [] __fscache_read_or_alloc_pages+0x413/0x4bc [fscache] [] ? __alloc_pages_nodemask+0x195/0x75c [] __nfs_readpages_from_fscache+0x86/0x13d [nfs] [] nfs_readpages+0x186/0x1bd [nfs] [] ? alloc_pages_current+0xc7/0xe4 [] ? __page_cache_alloc+0x84/0x91 [] ? __do_page_cache_readahead+0xa6/0x2e0 [] __do_page_cache_readahead+0x237/0x2e0 [] ? __do_page_cache_readahead+0xa6/0x2e0 [] ra_submit+0x1c/0x20 [] ondemand_readahead+0x359/0x382 [] page_cache_sync_readahead+0x38/0x3a [] generic_file_aio_read+0x26b/0x637 [] ? 
nfs_mark_delegation_referenced+0xb/0xb [nfsv4] [] nfs_file_read+0xaa/0xcf [nfs] [] do_sync_read+0x91/0xd1 [] vfs_read+0x9b/0x144 [] sys_read+0x44/0x75 [] system_call_fastpath+0x16/0x1b Signed-off-by: David Howells --- fs/fscache/internal.h | 3 ++- fs/fscache/operation.c | 5 ++++- fs/fscache/page.c | 17 ++++++++++++++--- 3 files changed, 20 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 88a48ccb7d9e..ee38fef4be51 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -121,7 +121,8 @@ extern int fscache_submit_exclusive_op(struct fscache_object *, struct fscache_operation *); extern int fscache_submit_op(struct fscache_object *, struct fscache_operation *); -extern int fscache_cancel_op(struct fscache_operation *); +extern int fscache_cancel_op(struct fscache_operation *, + void (*)(struct fscache_operation *)); extern void fscache_cancel_all_ops(struct fscache_object *); extern void fscache_abort_object(struct fscache_object *); extern void fscache_start_operations(struct fscache_object *); diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 36c59604130d..762a9ec4ffa4 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -298,7 +298,8 @@ void fscache_start_operations(struct fscache_object *object) /* * cancel an operation that's pending on an object */ -int fscache_cancel_op(struct fscache_operation *op) +int fscache_cancel_op(struct fscache_operation *op, + void (*do_cancel)(struct fscache_operation *)) { struct fscache_object *object = op->object; int ret; @@ -316,6 +317,8 @@ int fscache_cancel_op(struct fscache_operation *op) ASSERT(!list_empty(&op->pend_link)); fscache_stat(&fscache_n_op_cancelled); list_del_init(&op->pend_link); + if (do_cancel) + do_cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) object->n_exclusive--; diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 8a92b9fabe83..ff000e52072d 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -302,6 +302,17 @@ static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) return 0; } +/* + * Handle cancellation of a pending retrieval op + */ +static void fscache_do_cancel_retrieval(struct fscache_operation *_op) +{ + struct fscache_retrieval *op = + container_of(_op, struct fscache_retrieval, op); + + op->n_pages = 0; +} + /* * wait for an object to become active (or dead) */ @@ -320,7 +331,7 @@ static int fscache_wait_for_retrieval_activation(struct fscache_object *object, if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, fscache_wait_bit_interruptible, TASK_INTERRUPTIBLE) != 0) { - ret = fscache_cancel_op(&op->op); + ret = fscache_cancel_op(&op->op, fscache_do_cancel_retrieval); if (ret == 0) return -ERESTARTSYS; @@ -338,8 +349,8 @@ check_if_dead: return -ENOBUFS; } if (unlikely(fscache_object_is_dead(object))) { - pr_err("%s() = -ENOBUFS [obj dead %d]", __func__, op->op.state); - fscache_cancel_op(&op->op); + pr_err("%s() = -ENOBUFS [obj dead %d]\n", __func__, op->op.state); + fscache_cancel_op(&op->op, fscache_do_cancel_retrieval); fscache_stat(stat_object_dead); return -ENOBUFS; } -- cgit v1.2.1 From 70b31c4c88e253f4c2066367401118edab957614 Mon Sep 17 00:00:00 2001 From: Marco Stornelli Date: Sat, 15 Dec 2012 11:53:50 +0100 Subject: hpfs: drop vmtruncate Removed vmtruncate Signed-off-by: Marco Stornelli Signed-off-by: Al Viro --- fs/hpfs/file.c | 20 +++++++++++++------- fs/hpfs/hpfs_fn.h | 1 + fs/hpfs/inode.c | 5 ++++- 3 files changed, 18 
insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 89d2a5803ae3..fbfe2df5624b 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -50,7 +50,7 @@ static secno hpfs_bmap(struct inode *inode, unsigned file_secno) return disk_secno; } -static void hpfs_truncate(struct inode *i) +void hpfs_truncate(struct inode *i) { if (IS_IMMUTABLE(i)) return /*-EPERM*/; hpfs_lock_assert(i->i_sb); @@ -105,6 +105,16 @@ static int hpfs_readpage(struct file *file, struct page *page) return block_read_full_page(page,hpfs_get_block); } +static void hpfs_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, to, inode->i_size); + hpfs_truncate(inode); + } +} + static int hpfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -115,11 +125,8 @@ static int hpfs_write_begin(struct file *file, struct address_space *mapping, ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, hpfs_get_block, &hpfs_i(mapping->host)->mmu_private); - if (unlikely(ret)) { - loff_t isize = mapping->host->i_size; - if (pos + len > isize) - vmtruncate(mapping->host, isize); - } + if (unlikely(ret)) + hpfs_write_failed(mapping, pos + len); return ret; } @@ -166,6 +173,5 @@ const struct file_operations hpfs_file_ops = const struct inode_operations hpfs_file_iops = { - .truncate = hpfs_truncate, .setattr = hpfs_setattr, }; diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 7102aaecc244..b7ae286646b5 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -252,6 +252,7 @@ void hpfs_set_ea(struct inode *, struct fnode *, const char *, /* file.c */ int hpfs_file_fsync(struct file *, loff_t, loff_t, int); +void hpfs_truncate(struct inode *); extern const struct file_operations hpfs_file_ops; extern const struct inode_operations hpfs_file_iops; extern const struct address_space_operations hpfs_aops; diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 804a9a842cbc..5dc06c837105 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -277,9 +277,12 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { - error = vmtruncate(inode, attr->ia_size); + error = inode_newsize_ok(inode, attr->ia_size); if (error) goto out_unlock; + + truncate_setsize(inode, attr->ia_size); + hpfs_truncate(inode); } setattr_copy(inode, attr); -- cgit v1.2.1 From 86dd07d66a2f7284cfe2b771d062dd6c0e331766 Mon Sep 17 00:00:00 2001 From: Marco Stornelli Date: Sat, 15 Dec 2012 11:54:25 +0100 Subject: jfs: drop vmtruncate Removed vmtruncate Signed-off-by: Marco Stornelli Signed-off-by: Al Viro --- fs/jfs/file.c | 6 ++++-- fs/jfs/inode.c | 20 ++++++++++++++------ 2 files changed, 18 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 9d3afd157f99..dd7442c58358 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -119,9 +119,12 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr) iattr->ia_size != i_size_read(inode)) { inode_dio_wait(inode); - rc = vmtruncate(inode, iattr->ia_size); + rc = inode_newsize_ok(inode, iattr->ia_size); if (rc) return rc; + + truncate_setsize(inode, iattr->ia_size); + jfs_truncate(inode); } setattr_copy(inode, iattr); @@ -133,7 +136,6 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr) } const struct inode_operations jfs_file_inode_operations = { - .truncate 
= jfs_truncate, .setxattr = jfs_setxattr, .getxattr = jfs_getxattr, .listxattr = jfs_listxattr, diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 4692bf3ca8cb..b7dc47ba675e 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -300,6 +300,16 @@ static int jfs_readpages(struct file *file, struct address_space *mapping, return mpage_readpages(mapping, pages, nr_pages, jfs_get_block); } +static void jfs_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, to, inode->i_size); + jfs_truncate(inode); + } +} + static int jfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -308,11 +318,8 @@ static int jfs_write_begin(struct file *file, struct address_space *mapping, ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata, jfs_get_block); - if (unlikely(ret)) { - loff_t isize = mapping->host->i_size; - if (pos + len > isize) - vmtruncate(mapping->host, isize); - } + if (unlikely(ret)) + jfs_write_failed(mapping, pos + len); return ret; } @@ -326,6 +333,7 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; struct inode *inode = file->f_mapping->host; ssize_t ret; @@ -341,7 +349,7 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb, loff_t end = offset + iov_length(iov, nr_segs); if (end > isize) - vmtruncate(inode, isize); + jfs_write_failed(mapping, end); } return ret; -- cgit v1.2.1 From d506848567b529e57dfbcc4e28747b9211ffb7e5 Mon Sep 17 00:00:00 2001 From: Marco Stornelli Date: Sat, 15 Dec 2012 11:55:07 +0100 Subject: hfsplus: drop vmtruncate Removed vmtruncate Signed-off-by: Marco Stornelli Signed-off-by: Al Viro --- fs/hfsplus/inode.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 2172aa5976f5..799b336b59f9 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -28,6 +28,16 @@ static int hfsplus_writepage(struct page *page, struct writeback_control *wbc) return block_write_full_page(page, hfsplus_get_block, wbc); } +static void hfsplus_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, to, inode->i_size); + hfsplus_file_truncate(inode); + } +} + static int hfsplus_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -38,11 +48,8 @@ static int hfsplus_write_begin(struct file *file, struct address_space *mapping, ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, hfsplus_get_block, &HFSPLUS_I(mapping->host)->phys_size); - if (unlikely(ret)) { - loff_t isize = mapping->host->i_size; - if (pos + len > isize) - vmtruncate(mapping->host, isize); - } + if (unlikely(ret)) + hfsplus_write_failed(mapping, pos + len); return ret; } @@ -116,6 +123,7 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; ssize_t ret; @@ -131,7 +139,7 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb 
*iocb, loff_t end = offset + iov_length(iov, nr_segs); if (end > isize) - vmtruncate(inode, isize); + hfsplus_write_failed(mapping, end); } return ret; @@ -300,10 +308,8 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { inode_dio_wait(inode); - - error = vmtruncate(inode, attr->ia_size); - if (error) - return error; + truncate_setsize(inode, attr->ia_size); + hfsplus_file_truncate(inode); } setattr_copy(inode, attr); @@ -358,7 +364,6 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, static const struct inode_operations hfsplus_file_inode_operations = { .lookup = hfsplus_file_lookup, - .truncate = hfsplus_file_truncate, .setattr = hfsplus_setattr, .setxattr = hfsplus_setxattr, .getxattr = hfsplus_getxattr, -- cgit v1.2.1 From 5dfc2821e87893695bf4751fcbbdb56f42fa2985 Mon Sep 17 00:00:00 2001 From: Marco Stornelli Date: Sat, 15 Dec 2012 11:55:42 +0100 Subject: logfs: drop vmtruncate Removed vmtruncate Signed-off-by: Marco Stornelli Signed-off-by: Al Viro --- fs/logfs/readwrite.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index e1a3b6bf6324..9a59cbade2fb 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -1887,9 +1887,15 @@ int logfs_truncate(struct inode *inode, u64 target) logfs_put_wblocks(sb, NULL, 1); } - if (!err) - err = vmtruncate(inode, target); + if (!err) { + err = inode_newsize_ok(inode, target); + if (err) + goto out; + + truncate_setsize(inode, target); + } + out: /* I don't trust error recovery yet. */ WARN_ON(err); return err; -- cgit v1.2.1 From 7fc7cd00f616e38973fe3acd0dc7e473da94c52e Mon Sep 17 00:00:00 2001 From: Marco Stornelli Date: Sat, 15 Dec 2012 11:56:25 +0100 Subject: minix: drop vmtruncate Removed vmtruncate Signed-off-by: Marco Stornelli Signed-off-by: Al Viro --- fs/minix/file.c | 6 ++++-- fs/minix/inode.c | 17 ++++++++++++----- 2 files changed, 16 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/minix/file.c b/fs/minix/file.c index 4493ce695ab8..adc6f5494231 100644 --- a/fs/minix/file.c +++ b/fs/minix/file.c @@ -34,9 +34,12 @@ static int minix_setattr(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { - error = vmtruncate(inode, attr->ia_size); + error = inode_newsize_ok(inode, attr->ia_size); if (error) return error; + + truncate_setsize(inode, attr->ia_size); + minix_truncate(inode); } setattr_copy(inode, attr); @@ -45,7 +48,6 @@ static int minix_setattr(struct dentry *dentry, struct iattr *attr) } const struct inode_operations minix_file_inode_operations = { - .truncate = minix_truncate, .setattr = minix_setattr, .getattr = minix_getattr, }; diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 4fc5f8ab1c44..99541cceb584 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -395,6 +395,16 @@ int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len) return __block_write_begin(page, pos, len, minix_get_block); } +static void minix_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, to, inode->i_size); + minix_truncate(inode); + } +} + static int minix_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -403,11 +413,8 @@ static int 
minix_write_begin(struct file *file, struct address_space *mapping, ret = block_write_begin(mapping, pos, len, flags, pagep, minix_get_block); - if (unlikely(ret)) { - loff_t isize = mapping->host->i_size; - if (pos + len > isize) - vmtruncate(mapping->host, isize); - } + if (unlikely(ret)) + minix_write_failed(mapping, pos + len); return ret; } -- cgit v1.2.1 From 3e7a806928ac2dcae1423d6879e4f7e86e59bb9e Mon Sep 17 00:00:00 2001 From: Marco Stornelli Date: Sat, 15 Dec 2012 11:57:03 +0100 Subject: ncpfs: drop vmtruncate Removed vmtruncate Signed-off-by: Marco Stornelli Signed-off-by: Al Viro --- fs/ncpfs/inode.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index d7e9fe77188a..1acdad7fcec7 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -976,9 +976,7 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr) goto out; if (attr->ia_size != i_size_read(inode)) { - result = vmtruncate(inode, attr->ia_size); - if (result) - goto out; + truncate_setsize(inode, attr->ia_size); mark_inode_dirty(inode); } } -- cgit v1.2.1 From 2d1b399b22a8042edbaf41b1f2086d4183422ce4 Mon Sep 17 00:00:00 2001 From: Marco Stornelli Date: Sat, 15 Dec 2012 11:57:37 +0100 Subject: nilfs2: drop vmtruncate Removed vmtruncate Signed-off-by: Marco Stornelli Signed-off-by: Al Viro --- fs/nilfs2/file.c | 1 - fs/nilfs2/inode.c | 24 +++++++++++++++--------- fs/nilfs2/nilfs.h | 1 + fs/nilfs2/recovery.c | 3 ++- 4 files changed, 18 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 16f35f7423c5..61946883025c 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -167,7 +167,6 @@ const struct file_operations nilfs_file_operations = { }; const struct inode_operations nilfs_file_inode_operations = { - .truncate = nilfs_truncate, .setattr = nilfs_setattr, .permission = nilfs_permission, .fiemap = nilfs_fiemap, diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 4d31d2cca7fd..6b49f14eac8c 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -213,6 +213,16 @@ static int nilfs_set_page_dirty(struct page *page) return ret; } +void nilfs_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, to, inode->i_size); + nilfs_truncate(inode); + } +} + static int nilfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -227,10 +237,7 @@ static int nilfs_write_begin(struct file *file, struct address_space *mapping, err = block_write_begin(mapping, pos, len, flags, pagep, nilfs_get_block); if (unlikely(err)) { - loff_t isize = mapping->host->i_size; - if (pos + len > isize) - vmtruncate(mapping->host, isize); - + nilfs_write_failed(mapping, pos + len); nilfs_transaction_abort(inode->i_sb); } return err; @@ -259,6 +266,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; struct inode *inode = file->f_mapping->host; ssize_t size; @@ -278,7 +286,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t end = offset + iov_length(iov, nr_segs); if (end > isize) - vmtruncate(inode, isize); + nilfs_write_failed(mapping, end); } return size; @@ -786,10 +794,8 @@ int nilfs_setattr(struct dentry *dentry, struct iattr *iattr) if ((iattr->ia_valid & 
ATTR_SIZE) && iattr->ia_size != i_size_read(inode)) { inode_dio_wait(inode); - - err = vmtruncate(inode, iattr->ia_size); - if (unlikely(err)) - goto out_err; + truncate_setsize(inode, iattr->ia_size); + nilfs_truncate(inode); } setattr_copy(inode, iattr); diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 74cece80e9a3..9bc72dec3fa6 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -277,6 +277,7 @@ extern void nilfs_update_inode(struct inode *, struct buffer_head *); extern void nilfs_truncate(struct inode *); extern void nilfs_evict_inode(struct inode *); extern int nilfs_setattr(struct dentry *, struct iattr *); +extern void nilfs_write_failed(struct address_space *mapping, loff_t to); int nilfs_permission(struct inode *inode, int mask); int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh); extern int nilfs_inode_dirty(struct inode *); diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index f1626f5011c5..ff00a0b7acb9 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -527,7 +527,8 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, if (unlikely(err)) { loff_t isize = inode->i_size; if (pos + blocksize > isize) - vmtruncate(inode, isize); + nilfs_write_failed(inode->i_mapping, + pos + blocksize); goto failed_inode; } -- cgit v1.2.1 From 9014da7525dffef69131f717decf262e08ff3d58 Mon Sep 17 00:00:00 2001 From: Marco Stornelli Date: Sat, 15 Dec 2012 11:58:36 +0100 Subject: ntfs: drop vmtruncate Removed vmtruncate Signed-off-by: Marco Stornelli Reviewed-by: Anton Altaparmakov Signed-off-by: Al Viro --- fs/ntfs/file.c | 16 +++++++++++++--- fs/ntfs/inode.c | 8 ++++++-- fs/ntfs/inode.h | 4 ++++ 3 files changed, 23 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 1ecf46448f85..5b2d4f0853ac 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1762,6 +1762,16 @@ err_out: return err; } +static void ntfs_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, to, inode->i_size); + ntfs_truncate_vfs(inode); + } +} + /** * ntfs_file_buffered_write - * @@ -2022,8 +2032,9 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, * allocated space, which is not a disaster. */ i_size = i_size_read(vi); - if (pos + bytes > i_size) - vmtruncate(vi, i_size); + if (pos + bytes > i_size) { + ntfs_write_failed(mapping, pos + bytes); + } break; } } @@ -2227,7 +2238,6 @@ const struct file_operations ntfs_file_ops = { const struct inode_operations ntfs_file_inode_ops = { #ifdef NTFS_RW - .truncate = ntfs_truncate_vfs, .setattr = ntfs_setattr, #endif /* NTFS_RW */ }; diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 1d27331e6fc9..d3e118cc6ffa 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -2866,9 +2866,11 @@ conv_err_out: * * See ntfs_truncate() description above for details. */ +#ifdef NTFS_RW void ntfs_truncate_vfs(struct inode *vi) { ntfs_truncate(vi); } +#endif /** * ntfs_setattr - called from notify_change() when an attribute is being changed @@ -2914,8 +2916,10 @@ int ntfs_setattr(struct dentry *dentry, struct iattr *attr) NInoCompressed(ni) ? 
"compressed" : "encrypted"); err = -EOPNOTSUPP; - } else - err = vmtruncate(vi, attr->ia_size); + } else { + truncate_setsize(vi, attr->ia_size); + ntfs_truncate_vfs(vi); + } if (err || ia_valid == ATTR_SIZE) goto out; } else { diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h index db29695f845c..76b6cfb579d7 100644 --- a/fs/ntfs/inode.h +++ b/fs/ntfs/inode.h @@ -316,6 +316,10 @@ static inline void ntfs_commit_inode(struct inode *vi) return; } +#else + +static inline void ntfs_truncate_vfs(struct inode *vi) {} + #endif /* NTFS_RW */ #endif /* _LINUX_NTFS_INODE_H */ -- cgit v1.2.1 From d30357f2f0ec0bfb67fd39f8f76d22d02d78631e Mon Sep 17 00:00:00 2001 From: Marco Stornelli Date: Sat, 15 Dec 2012 11:59:20 +0100 Subject: vfs: drop vmtruncate Removed vmtruncate Signed-off-by: Marco Stornelli Signed-off-by: Al Viro --- fs/libfs.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/libfs.c b/fs/libfs.c index 35fc6e74cd88..916da8c4158b 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -369,8 +369,6 @@ int simple_setattr(struct dentry *dentry, struct iattr *iattr) struct inode *inode = dentry->d_inode; int error; - WARN_ON_ONCE(inode->i_op->truncate); - error = inode_change_ok(inode, iattr); if (error) return error; -- cgit v1.2.1 From b911a6bdeef5848c468597d040e3407e0aee04ce Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 8 Nov 2012 16:09:37 -0800 Subject: vfs: d_obtain_alias() needs to use "/" as default name. NFS appears to use d_obtain_alias() to create the root dentry rather than d_make_root. This can cause 'prepend_path()' to complain that the root has a weird name if an NFS filesystem is lazily unmounted. e.g. if "/mnt" is an NFS mount then { cd /mnt; umount -l /mnt ; ls -l /proc/self/cwd; } will cause a WARN message like WARNING: at /home/git/linux/fs/dcache.c:2624 prepend_path+0x1d7/0x1e0() ... Root dentry has weird name <> to appear in kernel logs. So change d_obtain_alias() to use "/" rather than "" as the anonymous name. 
Signed-off-by: NeilBrown Cc: Trond Myklebust Cc: Al Viro Cc: Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/dcache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 1782be3fc3ef..19153a0a810c 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1559,7 +1559,7 @@ EXPORT_SYMBOL(d_find_any_alias); */ struct dentry *d_obtain_alias(struct inode *inode) { - static const struct qstr anonstring = { .name = "" }; + static const struct qstr anonstring = QSTR_INIT("/", 1); struct dentry *tmp; struct dentry *res; -- cgit v1.2.1 From 836fb7e7b978e5f3b8b52e40838ddc50264723f0 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Dec 2012 12:10:05 -0500 Subject: vfs: make fstatat retry on ESTALE errors from getattr call Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/stat.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/stat.c b/fs/stat.c index eae494630a36..d22199527880 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -74,7 +74,7 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, { struct path path; int error = -EINVAL; - int lookup_flags = 0; + unsigned int lookup_flags = 0; if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | AT_EMPTY_PATH)) != 0) @@ -84,13 +84,17 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, lookup_flags |= LOOKUP_FOLLOW; if (flag & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; - +retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (error) goto out; error = vfs_getattr(path.mnt, path.dentry, stat); path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } out: return error; } -- cgit v1.2.1 From 7955119e02d9fdf78a39fba8073f19ca6152613e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Dec 2012 12:10:06 -0500 Subject: vfs: fix readlinkat to retry on ESTALE Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/stat.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/stat.c b/fs/stat.c index d22199527880..14f45459c83d 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -300,11 +300,13 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname, struct path path; int error; int empty = 0; + unsigned int lookup_flags = LOOKUP_EMPTY; if (bufsiz <= 0) return -EINVAL; - error = user_path_at_empty(dfd, pathname, LOOKUP_EMPTY, &path, &empty); +retry: + error = user_path_at_empty(dfd, pathname, lookup_flags, &path, &empty); if (!error) { struct inode *inode = path.dentry->d_inode; @@ -318,6 +320,10 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname, } } path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } } return error; } -- cgit v1.2.1 From 1ac12b4b6d707937f9de6d09622823b2fd0c93ef Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Dec 2012 12:10:06 -0500 Subject: vfs: turn is_dir argument to kern_path_create into a lookup_flags arg Where we can pass in LOOKUP_DIRECTORY or LOOKUP_REVAL. Any other flags passed in here are currently ignored. 
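For context, the retry decision in these lookup patches is centralised in the retry_estale() helper, which none of the hunks above show. Its definition (added to include/linux/namei.h earlier in this series) is roughly:

static inline bool retry_estale(const long error, const unsigned int flags)
{
	return error == -ESTALE && !(flags & LOOKUP_REVAL);
}

Each converted call site therefore retries at most once: the retry path sets LOOKUP_REVAL, so retry_estale() returns false on the second pass regardless of what the lookup reports.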
Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/namei.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 25a41e02984b..8f8e41f6eb52 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3030,12 +3030,22 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, return file; } -struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, int is_dir) +struct dentry *kern_path_create(int dfd, const char *pathname, + struct path *path, unsigned int lookup_flags) { struct dentry *dentry = ERR_PTR(-EEXIST); struct nameidata nd; int err2; - int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd); + int error; + bool is_dir = (lookup_flags & LOOKUP_DIRECTORY); + + /* + * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any + * other flags passed in are ignored! + */ + lookup_flags &= LOOKUP_REVAL; + + error = do_path_lookup(dfd, pathname, LOOKUP_PARENT|lookup_flags, &nd); if (error) return ERR_PTR(error); @@ -3099,13 +3109,14 @@ void done_path_create(struct path *path, struct dentry *dentry) } EXPORT_SYMBOL(done_path_create); -struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir) +struct dentry *user_path_create(int dfd, const char __user *pathname, + struct path *path, unsigned int lookup_flags) { struct filename *tmp = getname(pathname); struct dentry *res; if (IS_ERR(tmp)) return ERR_CAST(tmp); - res = kern_path_create(dfd, tmp->name, path, is_dir); + res = kern_path_create(dfd, tmp->name, path, lookup_flags); putname(tmp); return res; } @@ -3228,7 +3239,7 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) struct path path; int error; - dentry = user_path_create(dfd, pathname, &path, 1); + dentry = user_path_create(dfd, pathname, &path, LOOKUP_DIRECTORY); if (IS_ERR(dentry)) return PTR_ERR(dentry); -- cgit v1.2.1 From 972567f14cbcd437e9a88a73836bbc2ee0720b5f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 20 Dec 2012 16:00:10 -0500 Subject: vfs: fix mknodat to retry on ESTALE errors Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/namei.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 8f8e41f6eb52..b70c191b7e2b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3172,12 +3172,13 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, struct dentry *dentry; struct path path; int error; + unsigned int lookup_flags = 0; error = may_mknod(mode); if (error) return error; - - dentry = user_path_create(dfd, filename, &path, 0); +retry: + dentry = user_path_create(dfd, filename, &path, lookup_flags); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -3200,6 +3201,10 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, } out: done_path_create(&path, dentry); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } return error; } -- cgit v1.2.1 From b76d8b82266077dc7098dd13f321a616099a1bd8 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 20 Dec 2012 16:04:09 -0500 Subject: vfs: fix mkdirat to retry once on an ESTALE error Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/namei.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index b70c191b7e2b..1beebc1a38c9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3243,8 +3243,10 @@ 
SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) struct dentry *dentry; struct path path; int error; + unsigned int lookup_flags = LOOKUP_DIRECTORY; - dentry = user_path_create(dfd, pathname, &path, LOOKUP_DIRECTORY); +retry: + dentry = user_path_create(dfd, pathname, &path, lookup_flags); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -3254,6 +3256,10 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) if (!error) error = vfs_mkdir(path.dentry->d_inode, dentry, mode); done_path_create(&path, dentry); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } return error; } -- cgit v1.2.1 From f46d3567b223e41e1f2faeb82d3b74a6d84fc508 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Dec 2012 12:10:08 -0500 Subject: vfs: fix symlinkat to retry on ESTALE errors Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/namei.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 1beebc1a38c9..b06a111591a8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3521,12 +3521,13 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, struct filename *from; struct dentry *dentry; struct path path; + unsigned int lookup_flags = 0; from = getname(oldname); if (IS_ERR(from)) return PTR_ERR(from); - - dentry = user_path_create(newdfd, newname, &path, 0); +retry: + dentry = user_path_create(newdfd, newname, &path, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_putname; @@ -3535,6 +3536,10 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, if (!error) error = vfs_symlink(path.dentry->d_inode, dentry, from->name); done_path_create(&path, dentry); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } out_putname: putname(from); return error; -- cgit v1.2.1 From 442e31ca5a49e398351b2954b51f578353fdf210 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 20 Dec 2012 16:15:38 -0500 Subject: vfs: fix linkat to retry once on ESTALE errors Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/namei.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index b06a111591a8..6868699272bf 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3626,12 +3626,13 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, if (flags & AT_SYMLINK_FOLLOW) how |= LOOKUP_FOLLOW; - +retry: error = user_path_at(olddfd, oldname, how, &old_path); if (error) return error; - new_dentry = user_path_create(newdfd, newname, &new_path, 0); + new_dentry = user_path_create(newdfd, newname, &new_path, + (how & LOOKUP_REVAL)); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto out; @@ -3648,6 +3649,10 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry); out_dput: done_path_create(&new_path, new_dentry); + if (retry_estale(error, how)) { + how |= LOOKUP_REVAL; + goto retry; + } out: path_put(&old_path); -- cgit v1.2.1 From 9e790bd65ce4cbfdff305a57b67b1a2cbe5d4335 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Dec 2012 12:10:09 -0500 Subject: vfs: add a flags argument to user_path_parent ...so we can pass in LOOKUP_REVAL. For now, nothing does yet. 
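To make the intended use concrete, here is the shape the following patches give to callers of user_path_parent(); this is a sketch of the conversion pattern (compare do_rmdir and do_unlinkat below), not new code:

	unsigned int lookup_flags = 0;
retry:
	name = user_path_parent(dfd, pathname, &nd, lookup_flags);
	/* ... act on the parent directory ... */
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}

Because user_path_parent() masks its flags argument with LOOKUP_REVAL, callers may pass their whole flag word down without leaking unrelated bits into the LOOKUP_PARENT walk.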
Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/namei.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 6868699272bf..19190618695f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2175,15 +2175,19 @@ int user_path_at(int dfd, const char __user *name, unsigned flags, * path-walking is complete. */ static struct filename * -user_path_parent(int dfd, const char __user *path, struct nameidata *nd) +user_path_parent(int dfd, const char __user *path, struct nameidata *nd, + unsigned int flags) { struct filename *s = getname(path); int error; + /* only LOOKUP_REVAL is allowed in extra flags */ + flags &= LOOKUP_REVAL; + if (IS_ERR(s)) return s; - error = filename_lookup(dfd, s, LOOKUP_PARENT, nd); + error = filename_lookup(dfd, s, flags | LOOKUP_PARENT, nd); if (error) { putname(s); return ERR_PTR(error); @@ -3336,7 +3340,7 @@ static long do_rmdir(int dfd, const char __user *pathname) struct dentry *dentry; struct nameidata nd; - name = user_path_parent(dfd, pathname, &nd); + name = user_path_parent(dfd, pathname, &nd, 0); if (IS_ERR(name)) return PTR_ERR(name); @@ -3432,7 +3436,7 @@ static long do_unlinkat(int dfd, const char __user *pathname) struct nameidata nd; struct inode *inode = NULL; - name = user_path_parent(dfd, pathname, &nd); + name = user_path_parent(dfd, pathname, &nd, 0); if (IS_ERR(name)) return PTR_ERR(name); @@ -3827,13 +3831,13 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, struct filename *to; int error; - from = user_path_parent(olddfd, oldname, &oldnd); + from = user_path_parent(olddfd, oldname, &oldnd, 0); if (IS_ERR(from)) { error = PTR_ERR(from); goto exit; } - to = user_path_parent(newdfd, newname, &newnd); + to = user_path_parent(newdfd, newname, &newnd, 0); if (IS_ERR(to)) { error = PTR_ERR(to); goto exit1; -- cgit v1.2.1 From c6ee920698301febdf10df0b57039173a1edbd43 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 20 Dec 2012 16:28:33 -0500 Subject: vfs: make do_rmdir retry once on ESTALE errors Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/namei.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 19190618695f..fe06a2fd1925 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3339,8 +3339,9 @@ static long do_rmdir(int dfd, const char __user *pathname) struct filename *name; struct dentry *dentry; struct nameidata nd; - - name = user_path_parent(dfd, pathname, &nd, 0); + unsigned int lookup_flags = 0; +retry: + name = user_path_parent(dfd, pathname, &nd, lookup_flags); if (IS_ERR(name)) return PTR_ERR(name); @@ -3382,6 +3383,10 @@ exit2: exit1: path_put(&nd.path); putname(name); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } return error; } -- cgit v1.2.1 From 5d18f8133cad85ccbb7fa6fd351d75025da32504 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 20 Dec 2012 16:38:04 -0500 Subject: vfs: make do_unlinkat retry once on ESTALE errors Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/namei.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index fe06a2fd1925..8a262c2efff8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3440,8 +3440,9 @@ static long do_unlinkat(int dfd, const char __user *pathname) struct dentry *dentry; struct nameidata nd; struct inode *inode = NULL; - - name = user_path_parent(dfd, pathname, &nd, 0); + unsigned int lookup_flags = 0; 
+retry: + name = user_path_parent(dfd, pathname, &nd, lookup_flags); if (IS_ERR(name)) return PTR_ERR(name); @@ -3479,6 +3480,11 @@ exit2: exit1: path_put(&nd.path); putname(name); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + inode = NULL; + goto retry; + } return error; slashes: -- cgit v1.2.1 From c6a9428401c00a27d3c17264934d14e284570c97 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Dec 2012 12:10:10 -0500 Subject: vfs: fix renameat to retry on ESTALE errors ...as always, rename is the messiest of the bunch. We have to track whether to retry or not via a separate flag since the error handling is already quite complex. Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/namei.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 8a262c2efff8..43a97ee1d4c8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3840,15 +3840,17 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, struct nameidata oldnd, newnd; struct filename *from; struct filename *to; + unsigned int lookup_flags = 0; + bool should_retry = false; int error; - - from = user_path_parent(olddfd, oldname, &oldnd, 0); +retry: + from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags); if (IS_ERR(from)) { error = PTR_ERR(from); goto exit; } - to = user_path_parent(newdfd, newname, &newnd, 0); + to = user_path_parent(newdfd, newname, &newnd, lookup_flags); if (IS_ERR(to)) { error = PTR_ERR(to); goto exit1; @@ -3920,11 +3922,18 @@ exit3: unlock_rename(new_dir, old_dir); mnt_drop_write(oldnd.path.mnt); exit2: + if (retry_estale(error, lookup_flags)) + should_retry = true; path_put(&newnd.path); putname(to); exit1: path_put(&oldnd.path); putname(from); + if (should_retry) { + should_retry = false; + lookup_flags |= LOOKUP_REVAL; + goto retry; + } exit: return error; } -- cgit v1.2.1 From 48f7530d3f722617aa7cfea62b09b0c1a8d0173e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Dec 2012 12:10:11 -0500 Subject: vfs: have do_sys_truncate retry once on an ESTALE error Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/open.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index c819bbdab47f..07449b911a4d 100644 --- a/fs/open.c +++ b/fs/open.c @@ -115,17 +115,23 @@ EXPORT_SYMBOL_GPL(vfs_truncate); static long do_sys_truncate(const char __user *pathname, loff_t length) { + unsigned int lookup_flags = LOOKUP_FOLLOW; struct path path; int error; if (length < 0) /* sorry, but loff_t says... 
*/ return -EINVAL; - error = user_path(pathname, &path); +retry: + error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (!error) { error = vfs_truncate(&path, length); path_put(&path); } + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } return error; } -- cgit v1.2.1 From 87fa55952b7347175c6e2f03874869ad2c055adb Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Dec 2012 12:10:11 -0500 Subject: vfs: have faccessat retry once on an ESTALE error Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/open.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index 07449b911a4d..a994ccf39b40 100644 --- a/fs/open.c +++ b/fs/open.c @@ -316,6 +316,7 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) struct path path; struct inode *inode; int res; + unsigned int lookup_flags = LOOKUP_FOLLOW; if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ return -EINVAL; @@ -338,8 +339,8 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) } old_cred = override_creds(override_cred); - - res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); +retry: + res = user_path_at(dfd, filename, lookup_flags, &path); if (res) goto out; @@ -374,6 +375,10 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) out_path_release: path_put(&path); + if (retry_estale(res, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } out: revert_creds(old_cred); put_cred(override_cred); -- cgit v1.2.1 From 0291c0a551d5b0856627f2cb294da05f122414a0 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Dec 2012 12:10:12 -0500 Subject: vfs: have chdir retry lookup and call once on ESTALE error Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/open.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index a994ccf39b40..402dfcb6720b 100644 --- a/fs/open.c +++ b/fs/open.c @@ -394,8 +394,9 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename) { struct path path; int error; - - error = user_path_dir(filename, &path); + unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; +retry: + error = user_path_at(AT_FDCWD, filename, lookup_flags, &path); if (error) goto out; @@ -407,6 +408,10 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename) dput_and_out: path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } out: return error; } -- cgit v1.2.1 From 2771261ec5b677a38f0cd5fcfc6cefd5393787ef Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 20 Dec 2012 17:08:32 -0500 Subject: vfs: have chroot retry once on ESTALE error Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/open.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index 402dfcb6720b..a13a54d3e691 100644 --- a/fs/open.c +++ b/fs/open.c @@ -445,8 +445,9 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename) { struct path path; int error; - - error = user_path_dir(filename, &path); + unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; +retry: + error = user_path_at(AT_FDCWD, filename, lookup_flags, &path); if (error) goto out; @@ -465,6 +466,10 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename) error = 0; dput_and_out: path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } out: 
return error; } -- cgit v1.2.1 From 14ff690c0f94cf2e37f7c448f4f09bf0b4006d62 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Dec 2012 12:10:13 -0500 Subject: vfs: make fchmodat retry once on ESTALE errors Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/open.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index a13a54d3e691..99c3ce5f897b 100644 --- a/fs/open.c +++ b/fs/open.c @@ -514,11 +514,16 @@ SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode { struct path path; int error; - - error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); + unsigned int lookup_flags = LOOKUP_FOLLOW; +retry: + error = user_path_at(dfd, filename, lookup_flags, &path); if (!error) { error = chmod_common(&path, mode); path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } } return error; } -- cgit v1.2.1 From 99a5df37a03c99e57d0da4f847a515b658963fbb Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Dec 2012 12:10:13 -0500 Subject: vfs: make fchownat retry once on ESTALE errors Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/open.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index 99c3ce5f897b..9b33c0cbfacf 100644 --- a/fs/open.c +++ b/fs/open.c @@ -582,6 +582,7 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; if (flag & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; +retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (error) goto out; @@ -592,6 +593,10 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, mnt_drop_write(path.mnt); out_release: path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } out: return error; } -- cgit v1.2.1 From 96948fc6069b68380abac2944b8b02b43a2e2057 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Dec 2012 12:10:14 -0500 Subject: vfs: fix user_statfs to retry once on ESTALE errors Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/statfs.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/statfs.c b/fs/statfs.c index f8e832e6f0a2..c219e733f553 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -77,10 +77,17 @@ EXPORT_SYMBOL(vfs_statfs); int user_statfs(const char __user *pathname, struct kstatfs *st) { struct path path; - int error = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); + int error; + unsigned int lookup_flags = LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT; +retry: + error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (!error) { error = vfs_statfs(&path, st); path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } } return error; } -- cgit v1.2.1 From a69201d6f08240f20a0d33a1b7273d1e7748791c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Dec 2012 12:10:14 -0500 Subject: vfs: allow utimensat() calls to retry once on an ESTALE error Clearly, we can't handle the NULL filename case, but we can deal with the case where there's a real pathname. 
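A simplified sketch of why the NULL case is out of reach: do_utimes() has two arms, and only the pathname arm performs a lookup that can be retried with LOOKUP_REVAL. The fdget()/fdput() naming below follows the file-descriptor arm of the 3.7-era code and is illustrative only:

	if (filename == NULL && dfd != AT_FDCWD) {
		struct fd f = fdget(dfd);	/* no path walk happens,
						 * so ESTALE here is final */
		error = utimes_common(&f.file->f_path, times);
		fdput(f);
	} else {
		/* pathname arm: user_path_at() plus the retry_estale()
		 * loop this patch adds */
	}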
Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/utimes.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/utimes.c b/fs/utimes.c index bb0696a41735..f4fb7eca10e8 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -158,13 +158,17 @@ long do_utimes(int dfd, const char __user *filename, struct timespec *times, if (!(flags & AT_SYMLINK_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; - +retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (error) goto out; error = utimes_common(&path, times); path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } } out: -- cgit v1.2.1 From 68f1bb8bb89e0bb813c893a42373a26ebdab7f9c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Dec 2012 12:10:15 -0500 Subject: vfs: allow setxattr to retry once on ESTALE errors Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/xattr.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xattr.c b/fs/xattr.c index e21c119f4f99..c5e90d2d751f 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -370,8 +370,9 @@ SYSCALL_DEFINE5(setxattr, const char __user *, pathname, { struct path path; int error; - - error = user_path(pathname, &path); + unsigned int lookup_flags = LOOKUP_FOLLOW; +retry: + error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) return error; error = mnt_want_write(path.mnt); @@ -380,6 +381,10 @@ SYSCALL_DEFINE5(setxattr, const char __user *, pathname, mnt_drop_write(path.mnt); } path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } return error; } -- cgit v1.2.1 From 49e09e1cc576daa99ea22f3a3c699062c34f8c0f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Dec 2012 12:10:15 -0500 Subject: vfs: allow lsetxattr() to retry once on ESTALE errors Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/xattr.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xattr.c b/fs/xattr.c index c5e90d2d751f..74d36e063c8d 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -394,8 +394,9 @@ SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname, { struct path path; int error; - - error = user_lpath(pathname, &path); + unsigned int lookup_flags = 0; +retry: + error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) return error; error = mnt_want_write(path.mnt); @@ -404,6 +405,10 @@ SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname, mnt_drop_write(path.mnt); } path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } return error; } -- cgit v1.2.1 From 60e66b48ca20819d97ec4851007accc0b013985a Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Dec 2012 12:10:16 -0500 Subject: vfs: make getxattr retry once on an ESTALE error Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/xattr.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xattr.c b/fs/xattr.c index 74d36e063c8d..24833de649f8 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -486,12 +486,17 @@ SYSCALL_DEFINE4(getxattr, const char __user *, pathname, { struct path path; ssize_t error; - - error = user_path(pathname, &path); + unsigned int lookup_flags = LOOKUP_FOLLOW; +retry: + error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) return error; error = getxattr(path.dentry, name, value, size); path_put(&path); + if (retry_estale(error, lookup_flags)) { + 
lookup_flags |= LOOKUP_REVAL; + goto retry; + } return error; } -- cgit v1.2.1 From 3a3e159dbfe405517584b09bbcefd72115d93342 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Dec 2012 12:10:16 -0500 Subject: vfs: make lgetxattr retry once on ESTALE Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/xattr.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xattr.c b/fs/xattr.c index 24833de649f8..c127d57bf655 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -505,12 +505,17 @@ SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname, { struct path path; ssize_t error; - - error = user_lpath(pathname, &path); + unsigned int lookup_flags = 0; +retry: + error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) return error; error = getxattr(path.dentry, name, value, size); path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } return error; } -- cgit v1.2.1 From 10a90cf36efe0fca5c7719fd9b0299abd6be51aa Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Dec 2012 12:10:16 -0500 Subject: vfs: make listxattr retry once on ESTALE error Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/xattr.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xattr.c b/fs/xattr.c index c127d57bf655..1dc1eac17319 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -576,12 +576,17 @@ SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list, { struct path path; ssize_t error; - - error = user_path(pathname, &path); + unsigned int lookup_flags = LOOKUP_FOLLOW; +retry: + error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) return error; error = listxattr(path.dentry, list, size); path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } return error; } -- cgit v1.2.1 From bd9bbc9842bde1b14046cdbda1153f0d49061135 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Dec 2012 12:10:17 -0500 Subject: vfs: make llistxattr retry once on ESTALE error Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/xattr.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xattr.c b/fs/xattr.c index 1dc1eac17319..49d09e158809 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -595,12 +595,17 @@ SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list, { struct path path; ssize_t error; - - error = user_lpath(pathname, &path); + unsigned int lookup_flags = 0; +retry: + error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) return error; error = listxattr(path.dentry, list, size); path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } return error; } -- cgit v1.2.1 From 12f06212990400db1b54bd7b49b7f6f5f1b32469 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Dec 2012 12:10:17 -0500 Subject: vfs: make removexattr retry once on ESTALE Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/xattr.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xattr.c b/fs/xattr.c index 49d09e158809..4caa8efeada3 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -645,8 +645,9 @@ SYSCALL_DEFINE2(removexattr, const char __user *, pathname, { struct path path; int error; - - error = user_path(pathname, &path); + unsigned int lookup_flags = LOOKUP_FOLLOW; +retry: + error = user_path_at(AT_FDCWD, pathname, 
lookup_flags, &path); if (error) return error; error = mnt_want_write(path.mnt); @@ -655,6 +656,10 @@ SYSCALL_DEFINE2(removexattr, const char __user *, pathname, mnt_drop_write(path.mnt); } path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } return error; } -- cgit v1.2.1 From b729d75d19777a5dd34672020516eada43ff026f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Dec 2012 12:10:18 -0500 Subject: vfs: make lremovexattr retry once on ESTALE error Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/xattr.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xattr.c b/fs/xattr.c index 4caa8efeada3..3377dff18404 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -668,8 +668,9 @@ SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname, { struct path path; int error; - - error = user_lpath(pathname, &path); + unsigned int lookup_flags = 0; +retry: + error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) return error; error = mnt_want_write(path.mnt); @@ -678,6 +679,10 @@ SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname, mnt_drop_write(path.mnt); } path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } return error; } -- cgit v1.2.1 From b66c5984017533316fd1951770302649baf1aa33 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 20 Dec 2012 15:05:16 -0800 Subject: exec: do not leave bprm->interp on stack If a series of scripts is executed, each triggering module loading via unprintable bytes in the script header, kernel stack contents can leak into the command line. Normally execution of binfmt_script and binfmt_misc happens recursively. However, when modules are enabled, and unprintable bytes exist in the bprm->buf, execution will restart after attempting to load matching binfmt modules. Unfortunately, the logic in binfmt_script and binfmt_misc does not expect to get restarted. They leave bprm->interp pointing to their local stack. This means on restart bprm->interp is left pointing into unused stack memory which can then be copied into the userspace argv areas. After additional study, it seems that both recursion and restart remain the desirable way to handle exec with scripts, misc, and modules. As such, we need to protect the changes to interp. This changes the logic to require allocation for any changes to the bprm->interp. To avoid adding a new kmalloc to every exec, the default value is left as-is. Only when passing through binfmt_script or binfmt_misc does an allocation take place. For a proof of concept, see DoTest.sh from: http://www.halfdog.net/Security/2012/LinuxKernelBinfmtScriptStackDataDisclosure/ Signed-off-by: Kees Cook Cc: halfdog Cc: P J P Cc: Alexander Viro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 5 ++++- fs/binfmt_script.c | 4 +++- fs/exec.c | 15 +++++++++++++++ 3 files changed, 22 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 9be335fb8a7c..0c8869fdd14e 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -172,7 +172,10 @@ static int load_misc_binary(struct linux_binprm *bprm) goto _error; bprm->argc ++; - bprm->interp = iname; /* for binfmt_script */ + /* Update interp in case binfmt_script needs it. 
*/ + retval = bprm_change_interp(iname, bprm); + if (retval < 0) + goto _error; interp_file = open_exec (iname); retval = PTR_ERR (interp_file); diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index 1610a91637e5..5027a3e14922 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -80,7 +80,9 @@ static int load_script(struct linux_binprm *bprm) retval = copy_strings_kernel(1, &i_name, bprm); if (retval) return retval; bprm->argc++; - bprm->interp = interp; + retval = bprm_change_interp(interp, bprm); + if (retval < 0) + return retval; /* * OK, now restart the process with the interpreter's dentry. diff --git a/fs/exec.c b/fs/exec.c index d8e1191cb112..237d5342786c 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1175,9 +1175,24 @@ void free_bprm(struct linux_binprm *bprm) mutex_unlock(¤t->signal->cred_guard_mutex); abort_creds(bprm->cred); } + /* If a binfmt changed the interp, free it. */ + if (bprm->interp != bprm->filename) + kfree(bprm->interp); kfree(bprm); } +int bprm_change_interp(char *interp, struct linux_binprm *bprm) +{ + /* If a binfmt changed the interp, free it first. */ + if (bprm->interp != bprm->filename) + kfree(bprm->interp); + bprm->interp = kstrdup(interp, GFP_KERNEL); + if (!bprm->interp) + return -ENOMEM; + return 0; +} +EXPORT_SYMBOL(bprm_change_interp); + /* * install the new credentials for this executable */ -- cgit v1.2.1 From 5daa669c80c121ab75ecdf1c8e2df52f072fd25e Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 20 Dec 2012 15:05:24 -0800 Subject: hfsplus: avoid crash on failed block map free If the read fails we kmap an error code. This doesn't end well. Instead print a critical error and pray. This mirrors the rest of the fs behaviour with critical error cases. Acked-by: Vyacheslav Dubeyko Signed-off-by: Alan Cox Signed-off-by: Vyacheslav Dubeyko Cc: Al Viro Cc: Christoph Hellwig Cc: Jan Kara Acked-by: Hin-Tak Leung Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/bitmap.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c index 4cfbe2edd296..6feefc0cb48a 100644 --- a/fs/hfsplus/bitmap.c +++ b/fs/hfsplus/bitmap.c @@ -176,12 +176,14 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count); /* are all of the bits in range? 
*/ if ((offset + count) > sbi->total_blocks) - return -2; + return -ENOENT; mutex_lock(&sbi->alloc_mutex); mapping = sbi->alloc_file->i_mapping; pnr = offset / PAGE_CACHE_BITS; page = read_mapping_page(mapping, pnr, NULL); + if (IS_ERR(page)) + goto kaboom; pptr = kmap(page); curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32; end = pptr + PAGE_CACHE_BITS / 32; @@ -214,6 +216,8 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) set_page_dirty(page); kunmap(page); page = read_mapping_page(mapping, ++pnr, NULL); + if (IS_ERR(page)) + goto kaboom; pptr = kmap(page); curr = pptr; end = pptr + PAGE_CACHE_BITS / 32; @@ -232,4 +236,11 @@ out: mutex_unlock(&sbi->alloc_mutex); return 0; + +kaboom: + printk(KERN_CRIT "hfsplus: unable to mark blocks free: error %ld\n", + PTR_ERR(page)); + mutex_unlock(&sbi->alloc_mutex); + + return -EIO; } -- cgit v1.2.1 From 1b243fd39bd605cdfc482bba4e56b0cb34b28f27 Mon Sep 17 00:00:00 2001 From: Vyacheslav Dubeyko Date: Thu, 20 Dec 2012 15:05:25 -0800 Subject: hfsplus: rework processing errors in hfsplus_free_extents() Currently, hfsplus_free_extents() doesn't process error codes from the hfsplus_block_free() call. Add some error code processing. Signed-off-by: Vyacheslav Dubeyko Cc: Christoph Hellwig Cc: Al Viro Cc: Hin-Tak Leung Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/extents.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index 5849e3ef35cc..eba76eab6d62 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -329,6 +329,7 @@ static int hfsplus_free_extents(struct super_block *sb, { u32 count, start; int i; + int err = 0; hfsplus_dump_extent(extent); for (i = 0; i < 8; extent++, i++) { @@ -345,18 +346,33 @@ found: for (;;) { start = be32_to_cpu(extent->start_block); if (count <= block_nr) { - hfsplus_block_free(sb, start, count); + err = hfsplus_block_free(sb, start, count); + if (err) { + printk(KERN_ERR "hfs: can't free extent\n"); + dprint(DBG_EXTENT, " start: %u count: %u\n", + start, count); + } extent->block_count = 0; extent->start_block = 0; block_nr -= count; } else { count -= block_nr; - hfsplus_block_free(sb, start + count, block_nr); + err = hfsplus_block_free(sb, start + count, block_nr); + if (err) { + printk(KERN_ERR "hfs: can't free extent\n"); + dprint(DBG_EXTENT, " start: %u count: %u\n", + start, count); + } extent->block_count = cpu_to_be32(count); block_nr = 0; } - if (!block_nr || !i) - return 0; + if (!block_nr || !i) { + /* + * Try to free all extents and + * return only last error + */ + return err; + } i--; extent--; count = be32_to_cpu(extent->block_count); -- cgit v1.2.1 From 81cc7fad552bc9e4fa8c1f25becbecaaa1d41b67 Mon Sep 17 00:00:00 2001 From: Vyacheslav Dubeyko Date: Thu, 20 Dec 2012 15:05:28 -0800 Subject: hfsplus: rework processing of hfs_btree_write() returned error Add to hfs_btree_write() a return of -EIO on failure of b-tree node searching. Also add logic for processing errors from hfs_btree_write() in hfsplus_system_write_inode() with a message about b-tree writing failure. 
[akpm@linux-foundation.org: reduce scope of `err', print errno on error] Signed-off-by: Vyacheslav Dubeyko Cc: Christoph Hellwig Cc: Al Viro Acked-by: Hin-Tak Leung Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/btree.c | 5 +++-- fs/hfsplus/hfsplus_fs.h | 2 +- fs/hfsplus/super.c | 10 ++++++++-- 3 files changed, 12 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index 21023d9f8ff3..685d07d0ed18 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -159,7 +159,7 @@ void hfs_btree_close(struct hfs_btree *tree) kfree(tree); } -void hfs_btree_write(struct hfs_btree *tree) +int hfs_btree_write(struct hfs_btree *tree) { struct hfs_btree_header_rec *head; struct hfs_bnode *node; @@ -168,7 +168,7 @@ void hfs_btree_write(struct hfs_btree *tree) node = hfs_bnode_find(tree, 0); if (IS_ERR(node)) /* panic? */ - return; + return -EIO; /* Load the header */ page = node->page[0]; head = (struct hfs_btree_header_rec *)(kmap(page) + @@ -186,6 +186,7 @@ void hfs_btree_write(struct hfs_btree *tree) kunmap(page); set_page_dirty(page); hfs_bnode_put(node); + return 0; } static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx) diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index c571de224b15..a6da86b1b4c1 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -335,7 +335,7 @@ int hfsplus_block_free(struct super_block *, u32, u32); /* btree.c */ struct hfs_btree *hfs_btree_open(struct super_block *, u32); void hfs_btree_close(struct hfs_btree *); -void hfs_btree_write(struct hfs_btree *); +int hfs_btree_write(struct hfs_btree *); struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *); void hfs_bmap_free(struct hfs_bnode *); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 811a84d2d964..2036f585b094 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -127,8 +127,14 @@ static int hfsplus_system_write_inode(struct inode *inode) hfsplus_mark_mdb_dirty(inode->i_sb); } hfsplus_inode_write_fork(inode, fork); - if (tree) - hfs_btree_write(tree); + if (tree) { + int err = hfs_btree_write(tree); + if (err) { + printk(KERN_ERR "hfs: b-tree write err: %d, ino %lu\n", + err, inode->i_ino); + return err; + } + } return 0; } -- cgit v1.2.1 From bffdd661bd424ea4298639805bfcbcaf8ffb62f2 Mon Sep 17 00:00:00 2001 From: Vyacheslav Dubeyko Date: Thu, 20 Dec 2012 15:05:29 -0800 Subject: hfsplus: add error message for the case of failure of sync fs in delayed_sync_fs() method Add an error message for the case of failure of sync fs in delayed_sync_fs() method. 
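The constraint behind this patch, for context: delayed_sync_fs() runs as a workqueue item, and work callbacks return void,

	typedef void (*work_func_t)(struct work_struct *work);

so a failure of hfsplus_sync_fs() here has no caller to propagate to. Logging the error, as the hunk below does, is the only option short of stashing the result in hfsplus_sb_info for some later syscall to report.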
Signed-off-by: Vyacheslav Dubeyko Cc: Christoph Hellwig Cc: Al Viro Cc: Hin-Tak Leung Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/super.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 2036f585b094..796198d26553 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -232,6 +232,7 @@ out: static void delayed_sync_fs(struct work_struct *work) { + int err; struct hfsplus_sb_info *sbi; sbi = container_of(work, struct hfsplus_sb_info, sync_work.work); @@ -240,7 +241,9 @@ static void delayed_sync_fs(struct work_struct *work) sbi->work_queued = 0; spin_unlock(&sbi->work_lock); - hfsplus_sync_fs(sbi->alloc_file->i_sb, 1); + err = hfsplus_sync_fs(sbi->alloc_file->i_sb, 1); + if (err) + printk(KERN_ERR "hfs: delayed sync fs err %d\n", err); } void hfsplus_mark_mdb_dirty(struct super_block *sb) -- cgit v1.2.1 From ee297209bf0a25c6717b7c063e76795142d32f37 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Thu, 20 Dec 2012 15:05:44 -0800 Subject: proc: fix inconsistent lock state Lockdep found an inconsistent lock state when RCU is processing delayed work in softirq. Currently, the kernel uses spin_lock/spin_unlock to protect proc_inum_ida, but proc_free_inum is called by RCU in softirq context. Use spin_lock_bh/spin_unlock_bh to fix the following lockdep warning. ================================= [ INFO: inconsistent lock state ] 3.7.0 #36 Not tainted --------------------------------- inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage. swapper/1/0 [HC0[0]:SC1[1]:HE1:SE0] takes: (proc_inum_lock){+.?...}, at: proc_free_inum+0x1c/0x50 {SOFTIRQ-ON-W} state was registered at: __lock_acquire+0x8ae/0xca0 lock_acquire+0x199/0x200 _raw_spin_lock+0x41/0x50 proc_alloc_inum+0x4c/0xd0 alloc_mnt_ns+0x49/0xc0 create_mnt_ns+0x25/0x70 mnt_init+0x161/0x1c7 vfs_caches_init+0x107/0x11a start_kernel+0x348/0x38c x86_64_start_reservations+0x131/0x136 x86_64_start_kernel+0x103/0x112 irq event stamp: 2993422 hardirqs last enabled at (2993422): _raw_spin_unlock_irqrestore+0x55/0x80 hardirqs last disabled at (2993421): _raw_spin_lock_irqsave+0x29/0x70 softirqs last enabled at (2993394): _local_bh_enable+0x13/0x20 softirqs last disabled at (2993395): call_softirq+0x1c/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(proc_inum_lock); lock(proc_inum_lock); *** DEADLOCK *** no locks held by swapper/1/0. stack backtrace: Pid: 0, comm: swapper/1 Not tainted 3.7.0 #36 Call Trace: [] ? vprintk_emit+0x471/0x510 print_usage_bug+0x2a5/0x2c0 mark_lock+0x33b/0x5e0 __lock_acquire+0x813/0xca0 lock_acquire+0x199/0x200 _raw_spin_lock+0x41/0x50 proc_free_inum+0x1c/0x50 free_pid_ns+0x1c/0x50 put_pid_ns+0x2e/0x50 put_pid+0x4a/0x60 delayed_put_pid+0x12/0x20 rcu_process_callbacks+0x462/0x790 __do_softirq+0x1b4/0x3b0 call_softirq+0x1c/0x30 do_softirq+0x59/0xd0 irq_exit+0x54/0xd0 smp_apic_timer_interrupt+0x95/0xa3 apic_timer_interrupt+0x72/0x80 cpuidle_enter_tk+0x10/0x20 cpuidle_enter_state+0x17/0x50 cpuidle_idle_call+0x287/0x520 cpu_idle+0xba/0x130 start_secondary+0x2b3/0x2bc Signed-off-by: Xiaotian Feng Cc: Al Viro Cc: "Eric W. 
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 7b3ae3cc0ef9..e659a0ff1da7 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -359,18 +359,18 @@ retry: if (!ida_pre_get(&proc_inum_ida, GFP_KERNEL)) return -ENOMEM; - spin_lock(&proc_inum_lock); + spin_lock_bh(&proc_inum_lock); error = ida_get_new(&proc_inum_ida, &i); - spin_unlock(&proc_inum_lock); + spin_unlock_bh(&proc_inum_lock); if (error == -EAGAIN) goto retry; else if (error) return error; if (i > UINT_MAX - PROC_DYNAMIC_FIRST) { - spin_lock(&proc_inum_lock); + spin_lock_bh(&proc_inum_lock); ida_remove(&proc_inum_ida, i); - spin_unlock(&proc_inum_lock); + spin_unlock_bh(&proc_inum_lock); return -ENOSPC; } *inum = PROC_DYNAMIC_FIRST + i; @@ -379,9 +379,9 @@ retry: void proc_free_inum(unsigned int inum) { - spin_lock(&proc_inum_lock); + spin_lock_bh(&proc_inum_lock); ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); - spin_unlock(&proc_inum_lock); + spin_unlock_bh(&proc_inum_lock); } static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd) -- cgit v1.2.1 From c39540c6d1add1d0ad843b3d2437311924193359 Mon Sep 17 00:00:00 2001 From: Ravishankar N Date: Thu, 20 Dec 2012 15:05:46 -0800 Subject: fat: fix incorrect function comment fat_search_long() returns 0 on success, -ENOENT/ENOMEM on failure. Change the function comment accordingly. While at it, fix some trivial typos. Signed-off-by: Ravishankar N Signed-off-by: Namjae Jeon Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/dir.c | 5 ++--- fs/fat/inode.c | 2 +- fs/fat/misc.c | 4 ++++ 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 2a182342442e..58bf744dbf39 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -461,8 +461,7 @@ static int fat_parse_short(struct super_block *sb, } /* - * Return values: negative -> error, 0 -> not found, positive -> found, - * value is the total amount of slots, including the shortname entry. + * Return values: negative -> error/not found, 0 -> found. */ int fat_search_long(struct inode *inode, const unsigned char *name, int name_len, struct fat_slot_info *sinfo) @@ -1255,7 +1254,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots, sinfo->nr_slots = nr_slots; - /* First stage: search free direcotry entries */ + /* First stage: search free directory entries */ free_slots = nr_bhs = 0; bh = prev = NULL; pos = 0; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 35806813ea4e..f8f491677a4a 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1344,7 +1344,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, sbi->dir_entries = get_unaligned_le16(&b->dir_entries); if (sbi->dir_entries & (sbi->dir_per_block - 1)) { if (!silent) - fat_msg(sb, KERN_ERR, "bogus directroy-entries per block" + fat_msg(sb, KERN_ERR, "bogus directory-entries per block" " (%u)", sbi->dir_entries); brelse(bh); goto out_invalid; diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 5eb600dc43a9..359d307b5507 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -135,6 +135,10 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster) } if (ret < 0) return ret; + /* + * FIXME:Although we can add this cache, fat_cache_add() is + * assuming to be called after linear search with fat_cache_id. 
+ */ // fat_cache_add(inode, new_fclus, new_dclus); } else { MSDOS_I(inode)->i_start = new_dclus; -- cgit v1.2.1 From a68c2f12b4b28994aaf622bbe5724b7258cc2fcf Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Thu, 20 Dec 2012 15:05:52 -0800 Subject: sendfile: allows bypassing of notifier events do_sendfile() in fs/read_write.c does not call the fsnotify functions, unlike its neighbors. This manifests as a lack of inotify ACCESS events when a file is sent using sendfile(2). Addresses https://bugzilla.kernel.org/show_bug.cgi?id=12812 [akpm@linux-foundation.org: use fsnotify_modify(out.file), not fsnotify_access(), per Dave] Signed-off-by: Alan Cox Cc: Dave Chinner Cc: Jens Axboe Cc: Scott Wolchok Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/read_write.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/read_write.c b/fs/read_write.c index 1edaf099ddd7..bb34af315280 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -935,6 +935,8 @@ ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, if (retval > 0) { add_rchar(current, retval); add_wchar(current, retval); + fsnotify_access(in.file); + fsnotify_modify(out.file); } inc_syscr(current); -- cgit v1.2.1 From d7961c7fa4d2e3c3f12be67e21ba8799b5a7238a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 21 Dec 2012 00:15:51 -0500 Subject: jbd2: fix assertion failure in jbd2_journal_flush() The following race is possible between start_this_handle() and someone calling jbd2_journal_flush(). Process A Process B start_this_handle(). if (journal->j_barrier_count) # false if (!journal->j_running_transaction) { #true read_unlock(&journal->j_state_lock); jbd2_journal_lock_updates() jbd2_journal_flush() write_lock(&journal->j_state_lock); if (journal->j_running_transaction) { # false ... wait for committing trans ... write_unlock(&journal->j_state_lock); ... write_lock(&journal->j_state_lock); if (!journal->j_running_transaction) { # true jbd2_get_transaction(journal, new_transaction); write_unlock(&journal->j_state_lock); goto repeat; # eventually blocks on j_barrier_count > 0 ... J_ASSERT(!journal->j_running_transaction); # fails We fix the race by rechecking j_barrier_count after reacquiring j_state_lock in exclusive mode. 
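The fix follows the general rule for read-to-write lock handoffs: every condition tested under the read lock must be retested once the write lock is held, because the lock is dropped in between. Sketched against start_this_handle() (illustrative only; the real hunk follows):

	read_lock(&journal->j_state_lock);
	/* j_barrier_count and j_running_transaction are tested here */
	read_unlock(&journal->j_state_lock);
	/* window: jbd2_journal_lock_updates() can slip in here */
	write_lock(&journal->j_state_lock);
	if (!journal->j_running_transaction &&
	    !journal->j_barrier_count)	/* recheck both conditions */
		jbd2_get_transaction(journal, new_transaction);
	write_unlock(&journal->j_state_lock);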
Reported-by: yjwsignal@empal.com Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/jbd2/transaction.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index deffd945c8e2..cd4485db42b3 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -209,7 +209,8 @@ repeat: if (!new_transaction) goto alloc_transaction; write_lock(&journal->j_state_lock); - if (!journal->j_running_transaction) { + if (!journal->j_running_transaction && + !journal->j_barrier_count) { jbd2_get_transaction(journal, new_transaction); new_transaction = NULL; } -- cgit v1.2.1 From c129c29347b6cf0d64bfe53848f68320286612ab Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 21 Dec 2012 12:15:05 +0000 Subject: NFS: Provide stub nfs_fscache_wait_on_invalidate() for when CONFIG_NFS_FSCACHE=n Provide a stub nfs_fscache_wait_on_invalidate() function for when CONFIG_NFS_FSCACHE=n lest the following error appear: fs/nfs/inode.c: In function 'nfs_invalidate_mapping': fs/nfs/inode.c:887:2: error: implicit declaration of function 'nfs_fscache_wait_on_invalidate' [-Werror=implicit-function-declaration] cc1: some warnings being treated as errors Reported-by: kbuild test robot Reported-by: Vineet Gupta Reported-by: Borislav Petkov Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/nfs/fscache.h | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index 277b02782897..4ecb76652eba 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -222,6 +222,7 @@ static inline void nfs_readpage_to_fscache(struct inode *inode, static inline void nfs_fscache_invalidate(struct inode *inode) {} +static inline void nfs_fscache_wait_on_invalidate(struct inode *inode) {} static inline const char *nfs_server_fscache_state(struct nfs_server *server) { -- cgit v1.2.1 From c4271c6e37c32105492cbbed35f45330cb327b94 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 21 Dec 2012 11:02:32 -0500 Subject: NFS: Kill fscache warnings when mounting without -ofsc The fscache code will currently bleat a "non-unique superblock keys" warning even if the user is mounting without the 'fsc' option. There should be no reason to even initialise the superblock cache cookie unless we're planning on using fscache for something, so ensure that we check for the NFS_OPTION_FSCACHE flag before calling into the fscache code. 
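The shape of the fix, sketched ahead of the full hunk below: initialise the superblock's cache fields unconditionally, then bail out before any fscache call unless the mount actually asked for caching:

	nfss->fscache_key = NULL;
	nfss->fscache = NULL;

	if (!(parsed->options & NFS_OPTION_FSCACHE))
		return;

Since the super cookie is never created for non-'fsc' mounts, the non-unique key check inside fscache is simply never reached for them.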
Reported-by: Paweł Sikora Signed-off-by: Trond Myklebust Cc: David Howells Acked-by: David Howells Signed-off-by: Linus Torvalds --- fs/nfs/super.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index aa5315bb3666..c25cadf8f8c4 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2375,19 +2375,30 @@ static void nfs_get_cache_cookie(struct super_block *sb, struct nfs_parsed_mount_data *parsed, struct nfs_clone_mount *cloned) { + struct nfs_server *nfss = NFS_SB(sb); char *uniq = NULL; int ulen = 0; - if (parsed && parsed->fscache_uniq) { - uniq = parsed->fscache_uniq; - ulen = strlen(parsed->fscache_uniq); + nfss->fscache_key = NULL; + nfss->fscache = NULL; + + if (parsed) { + if (!(parsed->options & NFS_OPTION_FSCACHE)) + return; + if (parsed->fscache_uniq) { + uniq = parsed->fscache_uniq; + ulen = strlen(parsed->fscache_uniq); + } } else if (cloned) { struct nfs_server *mnt_s = NFS_SB(cloned->sb); + if (!(mnt_s->options & NFS_OPTION_FSCACHE)) + return; if (mnt_s->fscache_key) { uniq = mnt_s->fscache_key->key.uniquifier; ulen = mnt_s->fscache_key->key.uniq_len; }; - } + } else + return; nfs_fscache_get_super_cookie(sb, uniq, ulen); } -- cgit v1.2.1 From 10532b560bacf23766f9c7dc09778b31b198ff45 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 21 Dec 2012 19:48:59 -0500 Subject: Revert "nfsd: warn on odd reply state in nfsd_vfs_read" This reverts commit 79f77bf9a4e3dd5ead006b8f17e7c4ff07d8374e. This is obviously wrong, and I have no idea how I missed seeing the warning in testing: I must just not have looked at the right logs. The caller bumps rq_resused/rq_next_page, so it will always be hit on a large enough read. Reported-by: Dave Jones Signed-off-by: J. Bruce Fields Signed-off-by: Linus Torvalds --- fs/nfsd/vfs.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index f0a6d88d7fff..d586117fa94a 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -934,7 +934,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, .u.data = rqstp, }; - WARN_ON_ONCE(rqstp->rq_next_page != rqstp->rq_respages + 1); rqstp->rq_next_page = rqstp->rq_respages + 1; host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); } else { -- cgit v1.2.1 From 4520fb3c3690f2643006d85f09ecb74554c10e95 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 25 Dec 2012 13:28:54 -0500 Subject: ext4: split off ext4_journalled_invalidatepage() In data=journal mode we don't need delalloc or DIO handling in invalidatepage and similarly in other modes we don't need the journal handling. So split invalidatepage implementations. 
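After the split, each address_space_operations table carries the variant that matches its journalling mode; abbreviated from the hunk below:

	static const struct address_space_operations ext4_journalled_aops = {
		...
		.invalidatepage	= ext4_journalled_invalidatepage,
		...
	};

while the other modes keep the block_invalidatepage()-based variants, which no longer need to consider the journal at all.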
Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cb1c1ab2720b..12d3fbcff59f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2880,8 +2880,6 @@ static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offs static void ext4_invalidatepage(struct page *page, unsigned long offset) { - journal_t *journal = EXT4_JOURNAL(page->mapping->host); - trace_ext4_invalidatepage(page, offset); /* @@ -2889,16 +2887,27 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset) */ if (ext4_should_dioread_nolock(page->mapping->host)) ext4_invalidatepage_free_endio(page, offset); + + /* No journalling happens on data buffers when this function is used */ + WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); + + block_invalidatepage(page, offset); +} + +static void ext4_journalled_invalidatepage(struct page *page, + unsigned long offset) +{ + journal_t *journal = EXT4_JOURNAL(page->mapping->host); + + trace_ext4_journalled_invalidatepage(page, offset); + /* * If it's a full truncate we just forget about the pending dirtying */ if (offset == 0) ClearPageChecked(page); - if (journal) - jbd2_journal_invalidatepage(journal, page, offset); - else - block_invalidatepage(page, offset); + jbd2_journal_invalidatepage(journal, page, offset); } static int ext4_releasepage(struct page *page, gfp_t wait) @@ -3264,7 +3273,7 @@ static const struct address_space_operations ext4_journalled_aops = { .write_end = ext4_journalled_write_end, .set_page_dirty = ext4_journalled_set_page_dirty, .bmap = ext4_bmap, - .invalidatepage = ext4_invalidatepage, + .invalidatepage = ext4_journalled_invalidatepage, .releasepage = ext4_releasepage, .direct_IO = ext4_direct_IO, .is_partially_uptodate = block_is_partially_uptodate, -- cgit v1.2.1 From 53e872681fed6a43047e71bf927f77d06f467988 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 25 Dec 2012 13:29:52 -0500 Subject: ext4: fix deadlock in journal_unmap_buffer() We cannot wait for transaction commit in journal_unmap_buffer() because we hold the page lock, which ranks below transaction start. We solve the issue by bailing out of journal_unmap_buffer() and jbd2_journal_invalidatepage() with -EBUSY. The caller is then responsible for waiting for transaction commit to finish and trying the invalidation again. Since the issue can happen only for a page straddling i_size, it is simple enough to manually call jbd2_journal_invalidatepage() for such a page from ext4_setattr(), check the return value, and wait if necessary.
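The retry protocol, in outline (a simplified sketch of the caller's side; the full version in the diff below also looks up the committing transaction's tid under j_state_lock before waiting):

for (;;) {
	page = find_lock_page(inode->i_mapping, index);
	if (!page)
		return;
	ret = __ext4_journalled_invalidatepage(page, offset);
	unlock_page(page);
	page_cache_release(page);
	if (ret != -EBUSY)
		return;
	/* buffers were part of the committing transaction: wait, retry */
	jbd2_log_wait_commit(journal, commit_tid);
}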
Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 82 ++++++++++++++++++++++++++++++++++++++++++++------- fs/jbd2/transaction.c | 27 +++++++++-------- 2 files changed, 85 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 12d3fbcff59f..cbfe13bf5b2a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2894,8 +2894,8 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset) block_invalidatepage(page, offset); } -static void ext4_journalled_invalidatepage(struct page *page, - unsigned long offset) +static int __ext4_journalled_invalidatepage(struct page *page, + unsigned long offset) { journal_t *journal = EXT4_JOURNAL(page->mapping->host); @@ -2907,7 +2907,14 @@ static void ext4_journalled_invalidatepage(struct page *page, if (offset == 0) ClearPageChecked(page); - jbd2_journal_invalidatepage(journal, page, offset); + return jbd2_journal_invalidatepage(journal, page, offset); +} + +/* Wrapper for aops... */ +static void ext4_journalled_invalidatepage(struct page *page, + unsigned long offset) +{ + WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0); } static int ext4_releasepage(struct page *page, gfp_t wait) @@ -4313,6 +4320,47 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) return err; } +/* + * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate + * buffers that are attached to a page stradding i_size and are undergoing + * commit. In that case we have to wait for commit to finish and try again. + */ +static void ext4_wait_for_tail_page_commit(struct inode *inode) +{ + struct page *page; + unsigned offset; + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + tid_t commit_tid = 0; + int ret; + + offset = inode->i_size & (PAGE_CACHE_SIZE - 1); + /* + * All buffers in the last page remain valid? Then there's nothing to + * do. We do the check mainly to optimize the common PAGE_CACHE_SIZE == + * blocksize case + */ + if (offset > PAGE_CACHE_SIZE - (1 << inode->i_blkbits)) + return; + while (1) { + page = find_lock_page(inode->i_mapping, + inode->i_size >> PAGE_CACHE_SHIFT); + if (!page) + return; + ret = __ext4_journalled_invalidatepage(page, offset); + unlock_page(page); + page_cache_release(page); + if (ret != -EBUSY) + return; + commit_tid = 0; + read_lock(&journal->j_state_lock); + if (journal->j_committing_transaction) + commit_tid = journal->j_committing_transaction->t_tid; + read_unlock(&journal->j_state_lock); + if (commit_tid) + jbd2_log_wait_commit(journal, commit_tid); + } +} + /* * ext4_setattr() * @@ -4426,16 +4474,28 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) } if (attr->ia_valid & ATTR_SIZE) { - if (attr->ia_size != i_size_read(inode)) { - truncate_setsize(inode, attr->ia_size); - /* Inode size will be reduced, wait for dio in flight. - * Temporarily disable dioread_nolock to prevent - * livelock. */ + if (attr->ia_size != inode->i_size) { + loff_t oldsize = inode->i_size; + + i_size_write(inode, attr->ia_size); + /* + * Blocks are going to be removed from the inode. Wait + * for dio in flight. Temporarily disable + * dioread_nolock to prevent livelock. 
+ */ if (orphan) { - ext4_inode_block_unlocked_dio(inode); - inode_dio_wait(inode); - ext4_inode_resume_unlocked_dio(inode); + if (!ext4_should_journal_data(inode)) { + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + ext4_inode_resume_unlocked_dio(inode); + } else + ext4_wait_for_tail_page_commit(inode); } + /* + * Truncate pagecache after we've waited for commit + * in data=journal mode to make pages freeable. + */ + truncate_pagecache(inode, oldsize, inode->i_size); } ext4_truncate(inode); } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index cd4485db42b3..ddc51a7f4508 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1840,7 +1840,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, BUFFER_TRACE(bh, "entry"); -retry: /* * It is safe to proceed here without the j_list_lock because the * buffers cannot be stolen by try_to_free_buffers as long as we are @@ -1935,14 +1934,11 @@ retry: * for commit and try again. */ if (partial_page) { - tid_t tid = journal->j_committing_transaction->t_tid; - jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); write_unlock(&journal->j_state_lock); - jbd2_log_wait_commit(journal, tid); - goto retry; + return -EBUSY; } /* * OK, buffer won't be reachable after truncate. We just set @@ -2003,21 +1999,23 @@ zap_buffer_unlocked: * @page: page to flush * @offset: length of page to invalidate. * - * Reap page buffers containing data after offset in page. - * + * Reap page buffers containing data after offset in page. Can return -EBUSY + * if buffers are part of the committing transaction and the page is straddling + * i_size. Caller then has to wait for current commit and try again. */ -void jbd2_journal_invalidatepage(journal_t *journal, - struct page *page, - unsigned long offset) +int jbd2_journal_invalidatepage(journal_t *journal, + struct page *page, + unsigned long offset) { struct buffer_head *head, *bh, *next; unsigned int curr_off = 0; int may_free = 1; + int ret = 0; if (!PageLocked(page)) BUG(); if (!page_has_buffers(page)) - return; + return 0; /* We will potentially be playing with lists other than just the * data lists (especially for journaled data mode), so be @@ -2031,9 +2029,11 @@ void jbd2_journal_invalidatepage(journal_t *journal, if (offset <= curr_off) { /* This block is wholly outside the truncation point */ lock_buffer(bh); - may_free &= journal_unmap_buffer(journal, bh, - offset > 0); + ret = journal_unmap_buffer(journal, bh, offset > 0); unlock_buffer(bh); + if (ret < 0) + return ret; + may_free &= ret; } curr_off = next_off; bh = next; @@ -2044,6 +2044,7 @@ void jbd2_journal_invalidatepage(journal_t *journal, if (may_free && try_to_free_buffers(page)) J_ASSERT(!page_has_buffers(page)); } + return 0; } /* -- cgit v1.2.1 From ad96f7115593e962dd22a0519021eafaba56f5e3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 25 Dec 2012 13:31:52 -0500 Subject: ext4: fix an incorrect comment about i_mutex i_mutex is not held when ->sync_file is called. Reviewed-by: Jan Kara Signed-off-by: Andy Lutomirski Signed-off-by: "Theodore Ts'o" --- fs/ext4/fsync.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index dfbc1fe96674..3278e64e57b6 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -109,8 +109,6 @@ static int __sync_inode(struct inode *inode, int datasync) * * What we do is just kick off a commit and wait on it. This will snapshot the * inode to disk. 
- * - * i_mutex lock is held when entering and exiting this function */ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) -- cgit v1.2.1 From a28a9178e8fcd9b94f7333184ce78e816c8cb2af Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 25 Dec 2012 13:33:13 -0500 Subject: ext4: remove unaligned AIO warning printk Although I put this in, I now think it was a bad decision. For most users, there is very little to be done in this case. They get the message, once per day, with no real context or proposed action. TBH, it generates support calls when it probably does not need to; the message sounds more dire than the situation really is. Just nuke it. Normal investigation via blktrace or whatnot can reveal poor IO patterns if bad performance is encountered. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/file.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index b64a60bf105a..1c0aad7db1ec 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -108,14 +108,6 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov, /* Unaligned direct AIO must be serialized; see comment above */ if (unaligned_aio) { - static unsigned long unaligned_warn_time; - - /* Warn about this once per day */ - if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ)) - ext4_msg(inode->i_sb, KERN_WARNING, - "Unaligned AIO/DIO on inode %ld by %s; " - "performance will be poor.", - inode->i_ino, current->comm); mutex_lock(ext4_aio_mutex(inode)); ext4_unwritten_wait(inode); } -- cgit v1.2.1 From 0875a2b448fcaba67010850cf9649293a5ef653d Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 25 Dec 2012 13:56:01 -0500 Subject: ext4: include journal blocks in df overhead calcs To more accurately calculate overhead for "bsd" style df reporting, we should count the journal blocks as overhead as well. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" Tested-by: Eric Whitney --- fs/ext4/super.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e09f7d1646ba..4969167ac267 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3231,6 +3231,10 @@ int ext4_calculate_overhead(struct super_block *sb) memset(buf, 0, PAGE_SIZE); cond_resched(); } + /* Add the journal blocks as well */ + if (sbi->s_journal) + overhead += EXT4_B2C(sbi, sbi->s_journal->j_maxlen); + sbi->s_overhead = overhead; smp_wmb(); free_page((unsigned long) buf); -- cgit v1.2.1 From d096ad0f79a782935d2e06ae8fb235e8c5397775 Mon Sep 17 00:00:00 2001 From: Michael Tokarev Date: Tue, 25 Dec 2012 14:08:16 -0500 Subject: ext4: do not try to write superblock on ro remount w/o journal When a journal-less ext4 filesystem is mounted on a read-only block device (blockdev --setro will do), each remount (for other, unrelated, flags, like suid=>nosuid etc) results in a series of scary messages from the kernel telling about I/O errors on the device. This is because of the following code in ext4_remount(): if (sbi->s_journal == NULL) ext4_commit_super(sb, 1); at the end of the remount procedure, which forces a write (flush) of the superblock regardless of whether it is dirty, whether the filesystem is mounted read-only, and whether the device itself is read-only. We only need to call ext4_commit_super when the file system had been previously mounted read/write. Thanks to Eric Sandeen for help in diagnosing this issue.
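Expressed as code, the intended rule is a one-line guard (a sketch of the condition in isolation; old_sb_flags holds the superblock flags from before the remount):

/* Flush the superblock on remount only if we were read/write before */
if (sbi->s_journal == NULL && !(old_sb_flags & MS_RDONLY))
	ext4_commit_super(sb, 1);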
Signed-off-By: Michael Tokarev Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4969167ac267..183ae3447f64 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4729,7 +4729,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } ext4_setup_system_zone(sb); - if (sbi->s_journal == NULL) + if (sbi->s_journal == NULL && !(old_sb_flags & MS_RDONLY)) ext4_commit_super(sb, 1); #ifdef CONFIG_QUOTA -- cgit v1.2.1 From dfb2ea45becb198beeb75350d0b7b7ad9076a38f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 21 Dec 2012 20:38:00 -0800 Subject: proc: Allow proc_free_inum to be called from any context While testing the pid namespace code I hit this nasty warning. [ 176.262617] ------------[ cut here ]------------ [ 176.263388] WARNING: at /home/eric/projects/linux/linux-userns-devel/kernel/softirq.c:160 local_bh_enable_ip+0x7a/0xa0() [ 176.265145] Hardware name: Bochs [ 176.265677] Modules linked in: [ 176.266341] Pid: 742, comm: bash Not tainted 3.7.0userns+ #18 [ 176.266564] Call Trace: [ 176.266564] [] warn_slowpath_common+0x7f/0xc0 [ 176.266564] [] warn_slowpath_null+0x1a/0x20 [ 176.266564] [] local_bh_enable_ip+0x7a/0xa0 [ 176.266564] [] _raw_spin_unlock_bh+0x19/0x20 [ 176.266564] [] proc_free_inum+0x3a/0x50 [ 176.266564] [] free_pid_ns+0x1c/0x80 [ 176.266564] [] put_pid_ns+0x35/0x50 [ 176.266564] [] put_pid+0x4a/0x60 [ 176.266564] [] tty_ioctl+0x717/0xc10 [ 176.266564] [] ? wait_consider_task+0x855/0xb90 [ 176.266564] [] ? default_spin_lock_flags+0x9/0x10 [ 176.266564] [] ? remove_wait_queue+0x5a/0x70 [ 176.266564] [] do_vfs_ioctl+0x98/0x550 [ 176.266564] [] ? recalc_sigpending+0x1f/0x60 [ 176.266564] [] ? __set_task_blocked+0x37/0x80 [ 176.266564] [] ? sys_wait4+0xab/0xf0 [ 176.266564] [] sys_ioctl+0x91/0xb0 [ 176.266564] [] ? task_stopped_code+0x50/0x50 [ 176.266564] [] system_call_fastpath+0x16/0x1b [ 176.266564] ---[ end trace 387af88219ad6143 ]--- It turns out that spin_unlock_bh(proc_inum_lock) is not safe when put_pid is called with another spinlock held and irqs disabled. For now take the easy path and use spin_lock_irqsave(proc_inum_lock) in proc_free_inum and spin_lock_irq(proc_inum_lock) in proc_alloc_inum. Signed-off-by: "Eric W.
Biederman" --- fs/proc/generic.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index e064f562b1f7..76ddae83daa5 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -352,18 +352,18 @@ retry: if (!ida_pre_get(&proc_inum_ida, GFP_KERNEL)) return -ENOMEM; - spin_lock_bh(&proc_inum_lock); + spin_lock_irq(&proc_inum_lock); error = ida_get_new(&proc_inum_ida, &i); - spin_unlock_bh(&proc_inum_lock); + spin_unlock_irq(&proc_inum_lock); if (error == -EAGAIN) goto retry; else if (error) return error; if (i > UINT_MAX - PROC_DYNAMIC_FIRST) { - spin_lock_bh(&proc_inum_lock); + spin_lock_irq(&proc_inum_lock); ida_remove(&proc_inum_ida, i); - spin_unlock_bh(&proc_inum_lock); + spin_unlock_irq(&proc_inum_lock); return -ENOSPC; } *inum = PROC_DYNAMIC_FIRST + i; @@ -372,9 +372,10 @@ retry: void proc_free_inum(unsigned int inum) { - spin_lock_bh(&proc_inum_lock); + unsigned long flags; + spin_lock_irqsave(&proc_inum_lock, flags); ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); - spin_unlock_bh(&proc_inum_lock); + spin_unlock_irqrestore(&proc_inum_lock, flags); } static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd) -- cgit v1.2.1 From dfb7c0ceab57fee7618f4c9c31c5a89254e8530a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 12 Dec 2012 09:47:39 +0900 Subject: f2fs: remove set_page_dirty for atomic f2fs_end_io_write We should guarantee not to do *scheduling while atomic*. I found, in atomic f2fs_end_io_write(), there is a set_page_dirty() call to deal with IO errors. But, set_page_dirty() calls: -> f2fs_set_data_page_dirty() -> set_dirty_dir_page() -> cond_resched() which results in scheduling. In order to avoid this, I'd like to remove simply set_page_dirty(), since the page is already marked as ERROR and f2fs will be operated as the read-only mode as well. So, there is no recovery issue with this. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1b26e4ea1016..8bc1b6fdcf71 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -631,7 +631,6 @@ static void f2fs_end_io_write(struct bio *bio, int err) if (page->mapping) set_bit(AS_EIO, &page->mapping->flags); set_ckpt_flags(p->sbi->ckpt, CP_ERROR_FLAG); - set_page_dirty(page); } end_page_writeback(page); dec_page_count(p->sbi, F2FS_WRITEBACK); -- cgit v1.2.1 From 1362b5e347e27102ea0fa99c9932bca1ecde330f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 12 Dec 2012 19:45:49 +0900 Subject: f2fs: fix wrong calculation on f_files in statfs In f2fs_statfs(), f_files should be the total number of available inodes instead of the currently allocated inodes. So, this patch should resolve the reported bug below. Note that, showing 10% usage is not a bug, since f2fs reveals whole volume size as much as possible and shows the space overhead as *used*. This policy is fair enough with respect to other file systems. 
(loop0 is backed by 1GiB file) $ mkfs.f2fs /dev/loop0 F2FS-tools: Ver: 1.1.0 (2012-12-11) Info: sector size = 512 Info: total sectors = 2097152 (in 512bytes) Info: zone aligned segment0 blkaddr: 512 Info: format successful $ mount /dev/loop0 mnt/ $ df mnt/ Filesystem 1K-blocks Used Available Use% Mounted on /dev/loop0 1046528 98312 929784 10% /home/zeta/linux-devel/mtd-bench/mnt $ df mnt/ -i Filesystem Inodes IUsed IFree IUse% Mounted on /dev/loop0 1 -465918 465919 - /home/zeta/linux-devel/mtd-bench/mnt Notice IUsed is negative. Also, 10% usage on a fresh f2fs seems too much to be correct. Reported-and-Tested-by: Ezequiel Garcia Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 13867322cf5a..f4d9e03723db 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -148,8 +148,8 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count; buf->f_bavail = user_block_count - valid_user_blocks(sbi); - buf->f_files = valid_inode_count(sbi); - buf->f_ffree = sbi->total_node_count - valid_node_count(sbi); + buf->f_files = sbi->total_node_count; + buf->f_ffree = sbi->total_node_count - valid_inode_count(sbi); buf->f_namelen = F2FS_MAX_NAME_LEN; buf->f_fsid.val[0] = (u32)id; -- cgit v1.2.1 From 38e0abdcfb5e69aa61a1e9b474d434afc1c177a9 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 13 Dec 2012 23:44:11 +0900 Subject: f2fs: fix up f2fs_get_parent issue to retrieve correct parent inode number Test Case: [NFS Client] ls -lR . [NFS Server] while [ 1 ] do echo 3 > /proc/sys/vm/drop_caches done Error on NFS Client: "No such file or directory" When the cache is dropped at the server, it results in a lookup failure at the NFS client due to the missing connection with the parent. The default path initiates a lookup by calculating the hash value for the name, but the hash values stored on disk for "." and ".." are maintained as zero, which results in a failure from find_in_block due to mismatched hash values. Fix this up by using the correct hash values for these entries. Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 4 ++-- fs/f2fs/hash.c | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index b4e24f32b54e..e1f66df0f97d 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -540,13 +540,13 @@ int f2fs_make_empty(struct inode *inode, struct inode *parent) de = &dentry_blk->dentry[0]; de->name_len = cpu_to_le16(1); - de->hash_code = 0; + de->hash_code = f2fs_dentry_hash(".", 1); de->ino = cpu_to_le32(inode->i_ino); memcpy(dentry_blk->filename[0], ".", 1); set_de_type(de, inode); de = &dentry_blk->dentry[1]; - de->hash_code = 0; + de->hash_code = f2fs_dentry_hash("..", 2); de->name_len = cpu_to_le16(2); de->ino = cpu_to_le32(parent->i_ino); memcpy(dentry_blk->filename[1], "..", 2); diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index a60f04200f8b..5e48baca3597 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -76,6 +76,10 @@ f2fs_hash_t f2fs_dentry_hash(const char *name, int len) const char *p; __u32 in[8], buf[4]; + if ((len <= 2) && (name[0] == '.') &&
|| name[1] == '\0')) + return 0; + /* Initialize the default seed for the hash checksum functions */ buf[0] = 0x67452301; buf[1] = 0xefcdab89; -- cgit v1.2.1 From 398b1ac5a57219823f942a8d3665b27ab99354de Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 19 Dec 2012 15:28:39 +0900 Subject: f2fs: fix handling errors got by f2fs_write_inode Ruslan reported that f2fs hangs with an infinite loop in f2fs_sync_file(): while (sync_node_pages(sbi, inode->i_ino, &wbc) == 0) f2fs_write_inode(inode, NULL); The reason turned out to be that the cold flag is not set even though this inode is a normal file. Therefore, sync_node_pages() skips writing node blocks since it only writes cold node blocks. The cold flag is stored in the node_footer of the node block, and whenever a new node page is allocated, it is set according to its file type, file or directory. But, after a sudden power-off, when recovering the inode page, f2fs doesn't recover its cold flag. So, let's assign the cold flag in more of the right places. One more thing: if f2fs_write_inode() returns an error for whatever reason, there would be no dirty node pages, so sync_node_pages() returns zero. (i.e., zero means nothing was written.) Reported-by: Ruslan N. Marchenko Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 2 ++ fs/f2fs/file.c | 10 ++++++---- fs/f2fs/inode.c | 1 + fs/f2fs/node.c | 2 +- 4 files changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index e1f66df0f97d..4a78d6c4f3a7 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -11,6 +11,7 @@ #include #include #include "f2fs.h" +#include "node.h" #include "acl.h" static unsigned long dir_blocks(struct inode *inode) @@ -308,6 +309,7 @@ static int init_inode_metadata(struct inode *inode, struct dentry *dentry) ipage = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); + set_cold_node(inode, ipage); init_dent_inode(dentry, ipage); f2fs_put_page(ipage, 1); } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f9e085dfb1f0..7f9ea9271ebe 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -160,15 +160,17 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (need_to_sync_dir(sbi, inode)) need_cp = true; - f2fs_write_inode(inode, NULL); - if (need_cp) { /* all the dirty node pages should be flushed for POR */ ret = f2fs_sync_fs(inode->i_sb, 1); clear_inode_flag(F2FS_I(inode), FI_NEED_CP); } else { /* if there is no written node page, write its inode page */ while (!sync_node_pages(sbi, inode->i_ino, &wbc)) { ret = f2fs_write_inode(inode, NULL); if (ret) goto out; } filemap_fdatawait_range(sbi->node_inode->i_mapping, 0, LONG_MAX); } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index df5fb381ebf1..bf20b4d03214 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -203,6 +203,7 @@ void update_inode(struct inode *inode, struct page *node_page) ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags); ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino); ri->i_generation = cpu_to_le32(inode->i_generation); + set_cold_node(inode, node_page); set_page_dirty(node_page); } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 19870361497e..dffac1c11f63 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -834,11 +834,11 @@ struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) goto fail; } set_node_addr(sbi, &new_ni, NEW_ADDR); + set_cold_node(dn->inode, page); dn->node_page = page;
sync_inode_page(dn); set_page_dirty(page); - set_cold_node(dn->inode, page); if (ofs == 0) inc_valid_inode_count(sbi); -- cgit v1.2.1 From 30f0c75858c46a0273ccb838de401b1f5fdebe6f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 19 Dec 2012 16:09:19 +0900 Subject: f2fs: should recover orphan and fsync data The recovery routine should run every time, regardless of whether the previous unmount was a clean one. Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f4d9e03723db..50240d28ca24 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -528,8 +528,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) /* if there are nt orphan nodes free them */ err = -EINVAL; - if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) && - recover_orphan_inodes(sbi)) + if (recover_orphan_inodes(sbi)) goto free_node_inode; /* read root inode and dentry */ @@ -548,8 +547,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) } /* recover fsynced data */ - if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) && - !test_opt(sbi, DISABLE_ROLL_FORWARD)) + if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) recover_fsync_data(sbi); /* After POR, we can run background GC thread */ -- cgit v1.2.1 From 1efef832020ef392deb2cd3d74e0c316711245be Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 19 Dec 2012 16:25:21 +0900 Subject: f2fs: do f2fs_balance_fs in front of dir operations In order to conserve free sections to deal with the worst-case scenarios, f2fs should be able to freeze all the directory operations, especially when there are not enough free sections. f2fs_balance_fs() is for this use. When FS utilization becomes almost 100%, directory operations can frequently fail with -ENOSPC, which occasionally produces some dirty node pages. Previously, in such a case, f2fs_balance_fs() could not be triggered, since it was triggered only if the directory operation ended up succeeding. So, this patch triggers f2fs_balance_fs() first, before handling directory operations.
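The resulting pattern is uniform across the directory operations; a sketch using f2fs_create() as the example (signature abbreviated and error handling elided, so this is an outline rather than the exact source):

static int f2fs_create(struct inode *dir, struct dentry *dentry,
		       umode_t mode, bool excl)
{
	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);

	/*
	 * Balance first: this may trigger GC, and it must happen even
	 * when the operation below fails with -ENOSPC.
	 */
	f2fs_balance_fs(sbi);

	/* ... allocate the inode and add the dentry as before ... */
	return 0;
}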
Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 89b7675dc377..b42389f80011 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -123,6 +123,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, nid_t ino = 0; int err; + f2fs_balance_fs(sbi); + inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -144,8 +146,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (!sbi->por_doing) d_instantiate(dentry, inode); unlock_new_inode(inode); - - f2fs_balance_fs(sbi); return 0; out: clear_nlink(inode); @@ -163,6 +163,8 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, struct f2fs_sb_info *sbi = F2FS_SB(sb); int err; + f2fs_balance_fs(sbi); + inode->i_ctime = CURRENT_TIME; atomic_inc(&inode->i_count); @@ -172,8 +174,6 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, goto out; d_instantiate(dentry, inode); - - f2fs_balance_fs(sbi); return 0; out: clear_inode_flag(F2FS_I(inode), FI_INC_LINK); @@ -223,6 +223,8 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) struct page *page; int err = -ENOENT; + f2fs_balance_fs(sbi); + de = f2fs_find_entry(dir, &dentry->d_name, &page); if (!de) goto fail; @@ -238,7 +240,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) /* In order to evict this inode, we set it dirty */ mark_inode_dirty(inode); - f2fs_balance_fs(sbi); fail: return err; } @@ -252,6 +253,8 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, unsigned symlen = strlen(symname) + 1; int err; + f2fs_balance_fs(sbi); + inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -268,9 +271,6 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, d_instantiate(dentry, inode); unlock_new_inode(inode); - - f2fs_balance_fs(sbi); - return err; out: clear_nlink(inode); @@ -286,6 +286,8 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; int err; + f2fs_balance_fs(sbi); + inode = f2fs_new_inode(dir, S_IFDIR | mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -305,7 +307,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) d_instantiate(dentry, inode); unlock_new_inode(inode); - f2fs_balance_fs(sbi); return 0; out_fail: @@ -336,6 +337,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; + f2fs_balance_fs(sbi); + inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -350,9 +353,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, alloc_nid_done(sbi, inode->i_ino); d_instantiate(dentry, inode); unlock_new_inode(inode); - - f2fs_balance_fs(sbi); - return 0; out: clear_nlink(inode); @@ -376,6 +376,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, struct f2fs_dir_entry *new_entry; int err = -ENOENT; + f2fs_balance_fs(sbi); + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) goto out; @@ -441,8 +443,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } mutex_unlock_op(sbi, RENAME); - - f2fs_balance_fs(sbi); return 0; out_dir: -- cgit v1.2.1 From 48c6d1217e3dc743e7d3ad9b9def8d4810d13a85 Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Sat, 22 Dec 2012 01:52:39 -0800 Subject: f2fs: Don't assign e_id in f2fs_acl_from_disk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With user namespaces enabled building f2fs fails with: CC fs/f2fs/acl.o fs/f2fs/acl.c: In function ‘f2fs_acl_from_disk’: fs/f2fs/acl.c:85:21: error: ‘struct posix_acl_entry’ has no member named ‘e_id’ make[2]: *** [fs/f2fs/acl.o] Error 1 make[2]: Target `__build' not remade because of errors. e_id is a backwards compatibility field only used for file systems that haven't been converted to use kuids and kgids. When the posix acl tag field is neither ACL_USER nor ACL_GROUP assigning e_id is unnecessary. Remove the assignment so f2fs will build with user namespaces enabled. Cc: Namjae Jeon Cc: Amit Sahrawat Acked-by: Jaegeuk Kim Signed-off-by: "Eric W. Biederman" --- fs/f2fs/acl.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index fed74d193ffb..e95b94945d5f 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -82,7 +82,6 @@ static struct posix_acl *f2fs_acl_from_disk(const char *value, size_t size) case ACL_GROUP_OBJ: case ACL_MASK: case ACL_OTHER: - acl->a_entries[i].e_id = ACL_UNDEFINED_ID; entry = (struct f2fs_acl_entry *)((char *)entry + sizeof(struct f2fs_acl_entry_short)); break; -- cgit v1.2.1 From 721e3eba21e43532e438652dd8f1fcdfce3187e7 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 27 Dec 2012 01:42:48 -0500 Subject: ext4: lock i_mutex when truncating orphan inodes Commit c278531d39 added a warning when ext4_flush_unwritten_io() is called without i_mutex being taken. It had previously not been taken during orphan cleanup since races weren't possible at that point in the mount process, but as a result of this c278531d39, we will now see a kernel WARN_ON in this case. Take the i_mutex in ext4_orphan_cleanup() to suppress this warning. Reported-by: Alexander Beregalov Signed-off-by: "Theodore Ts'o" Reviewed-by: Zheng Liu Cc: stable@vger.kernel.org --- fs/ext4/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 183ae3447f64..3d4fb81bacd5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2220,7 +2220,9 @@ static void ext4_orphan_cleanup(struct super_block *sb, __func__, inode->i_ino, inode->i_size); jbd_debug(2, "truncating inode %lu to %lld bytes\n", inode->i_ino, inode->i_size); + mutex_lock(&inode->i_mutex); ext4_truncate(inode); + mutex_unlock(&inode->i_mutex); nr_truncates++; } else { ext4_msg(sb, KERN_DEBUG, -- cgit v1.2.1 From 0e9a9a1ad619e7e987815d20262d36a2f95717ca Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 27 Dec 2012 01:42:50 -0500 Subject: ext4: avoid hang when mounting non-journal filesystems with orphan list When trying to mount a file system which does not contain a journal, but which does have a orphan list containing an inode which needs to be truncated, the mount call with hang forever in ext4_orphan_cleanup() because ext4_orphan_del() will return immediately without removing the inode from the orphan list, leading to an uninterruptible loop in kernel code which will busy out one of the CPU's on the system. This can be trivially reproduced by trying to mount the file system found in tests/f_orphan_extents_inode/image.gz from the e2fsprogs source tree. If a malicious user were to put this on a USB stick, and mount it on a Linux desktop which has automatic mounts enabled, this could be considered a potential denial of service attack. 
(Not a big deal in practice, but professional paranoids worry about such things, and have even been known to allocate CVE numbers for such problems.) Signed-off-by: "Theodore Ts'o" Reviewed-by: Zheng Liu Cc: stable@vger.kernel.org --- fs/ext4/namei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index cac448282331..8990165346ee 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2648,7 +2648,8 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) struct ext4_iloc iloc; int err = 0; - if (!EXT4_SB(inode->i_sb)->s_journal) + if ((!EXT4_SB(inode->i_sb)->s_journal) && + !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) return 0; mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); -- cgit v1.2.1 From 690e4a3ead5f88fc95f7650816d1376aa2e79db5 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 19 Dec 2012 22:19:30 +0100 Subject: f2fs: add missing #include <linux/prefetch.h> MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit m68k allmodconfig: fs/f2fs/data.c: In function ‘read_end_io’: fs/f2fs/data.c:311: error: implicit declaration of function ‘prefetchw’ fs/f2fs/segment.c: In function ‘f2fs_end_io_write’: fs/f2fs/segment.c:628: error: implicit declaration of function ‘prefetchw’ Signed-off-by: Geert Uytterhoeven Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 1 + fs/f2fs/segment.c | 1 + 2 files changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 655aeabc1dd4..3aa5ce7cab83 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -16,6 +16,7 @@ #include #include #include +#include <linux/prefetch.h> #include "f2fs.h" #include "node.h" diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8bc1b6fdcf71..ca7b5ffb09d5 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -12,6 +12,7 @@ #include #include #include +#include <linux/prefetch.h> #include #include "f2fs.h" -- cgit v1.2.1 From 71e9fec548a95b2a4cf378646addd5d3098684a2 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 20 Dec 2012 15:10:06 +0900 Subject: f2fs: invalidate the node page if allocation is failed The new_node_page() function proceeds as follows. 1. A new node page is allocated. 2. Set PageUptodate with proper footer information. 3. Check if there is free space for the allocation. 4.a. If there is no space, f2fs returns with -ENOSPC. 4.b. Otherwise, go next. In the case of step #4.a, f2fs leaves a wrong node page in the page cache with the uptodate flag set. Also, even though a new node page is allocated successfully, an error can occur afterwards due to an allocation failure in the other data structures. In such a case, remove_inode_page() would be triggered, so we have to clear the uptodate flag in truncate_node() too. So, we should remove the uptodate flag if the allocation failed.
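The failure path, in outline (a sketch of the shared error exit, not the full function): once SetPageUptodate() has run, every error exit must scrub the page state before releasing it, or a stale "uptodate" node page lingers in the page cache:

fail:
	/* undo step 2: drop the dirty and uptodate state of the page */
	clear_node_page_dirty(page);
	f2fs_put_page(page, 1);
	return ERR_PTR(err);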
Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index dffac1c11f63..e85643cc74a9 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -484,12 +484,14 @@ static void truncate_node(struct dnode_of_data *dn) struct node_info ni; get_node_info(sbi, dn->nid, &ni); + if (dn->inode->i_blocks == 0) { + BUG_ON(ni.blk_addr != NULL_ADDR); + goto invalidate; + } BUG_ON(ni.blk_addr == NULL_ADDR); - if (ni.blk_addr != NULL_ADDR) - invalidate_blocks(sbi, ni.blk_addr); - /* Deallocate node address */ + invalidate_blocks(sbi, ni.blk_addr); dec_valid_node_count(sbi, dn->inode, 1); set_node_addr(sbi, &ni, NULL_ADDR); @@ -499,7 +501,7 @@ static void truncate_node(struct dnode_of_data *dn) } else { sync_inode_page(dn); } - +invalidate: clear_node_page_dirty(dn->node_page); F2FS_SET_SB_DIRT(sbi); @@ -768,20 +770,12 @@ int remove_inode_page(struct inode *inode) dn.inode_page_locked = 1; truncate_node(&dn); } - if (inode->i_blocks == 1) { - /* inernally call f2fs_put_page() */ - set_new_dnode(&dn, inode, page, page, ino); - truncate_node(&dn); - } else if (inode->i_blocks == 0) { - struct node_info ni; - get_node_info(sbi, inode->i_ino, &ni); - /* called after f2fs_new_inode() is failed */ - BUG_ON(ni.blk_addr != NULL_ADDR); - f2fs_put_page(page, 1); - } else { - BUG(); - } + /* 0 is possible, after f2fs_new_inode() is failed */ + BUG_ON(inode->i_blocks != 0 && inode->i_blocks != 1); + set_new_dnode(&dn, inode, page, page, ino); + truncate_node(&dn); + mutex_unlock_op(sbi, NODE_TRUNC); return 0; } @@ -845,6 +839,7 @@ struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) return page; fail: + clear_node_page_dirty(page); f2fs_put_page(page, 1); return ERR_PTR(err); } -- cgit v1.2.1 From 12a67146e35ba1d04ac4a5430eaaa8790158d60e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 21 Dec 2012 11:47:05 +0900 Subject: f2fs: return a default value for non-void function This patch resolves a build warning reported by kbuild test robot. 
" fs/f2fs/segment.c: In function '__get_segment_type': fs/f2fs/segment.c:806:1: warning: control reaches end of non-void function [-Wreturn-type] " Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 7 +++---- fs/f2fs/super.c | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ca7b5ffb09d5..fe2cc0bdc115 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -791,11 +791,10 @@ static int __get_segment_type(struct page *page, enum page_type p_type) return __get_segment_type_2(page, p_type); case 4: return __get_segment_type_4(page, p_type); - case 6: - return __get_segment_type_6(page, p_type); - default: - BUG(); } + /* NR_CURSEG_TYPE(6) logs by default */ + BUG_ON(sbi->active_logs != NR_CURSEG_TYPE); + return __get_segment_type_6(page, p_type); } static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 50240d28ca24..cf0ffb800654 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -302,7 +302,7 @@ static int parse_options(struct f2fs_sb_info *sbi, char *options) case Opt_active_logs: if (args->from && match_int(args, &arg)) return -EINVAL; - if (arg != 2 && arg != 4 && arg != 6) + if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE) return -EINVAL; sbi->active_logs = arg; break; -- cgit v1.2.1 From 029cd28c1f739bbfc5105035696d5f1f4e45d161 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 21 Dec 2012 17:20:21 +0900 Subject: f2fs: fix equation of has_not_enough_free_secs() Practically, has_not_enough_free_secs() should calculate with the numbers of current node and directory data blocks together. Actually the equation was implemented in need_to_flush(). So, this patch removes need_flush() and moves the equation into has_not_enough_free_secs(). Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 36 ++---------------------------------- fs/f2fs/segment.h | 15 ++++++++++++++- 2 files changed, 16 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fe2cc0bdc115..66f5e82ec324 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -19,48 +19,16 @@ #include "segment.h" #include "node.h" -static int need_to_flush(struct f2fs_sb_info *sbi) -{ - unsigned int pages_per_sec = (1 << sbi->log_blocks_per_seg) * - sbi->segs_per_sec; - int node_secs = ((get_pages(sbi, F2FS_DIRTY_NODES) + pages_per_sec - 1) - >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; - int dent_secs = ((get_pages(sbi, F2FS_DIRTY_DENTS) + pages_per_sec - 1) - >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; - - if (sbi->por_doing) - return 0; - - if (free_sections(sbi) <= (node_secs + 2 * dent_secs + - reserved_sections(sbi))) - return 1; - return 0; -} - /* * This function balances dirty node and dentry pages. * In addition, it controls garbage collection. */ void f2fs_balance_fs(struct f2fs_sb_info *sbi) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .for_reclaim = 0, - }; - - if (sbi->por_doing) - return; - /* - * We should do checkpoint when there are so many dirty node pages - * with enough free segments. After then, we should do GC. + * We should do GC or end up with checkpoint, if there are so many dirty + * dir/node pages without enough free segments. 
*/ - if (need_to_flush(sbi)) { - sync_dirty_dir_inodes(sbi); - sync_node_pages(sbi, 0, &wbc); - } - if (has_not_enough_free_secs(sbi)) { mutex_lock(&sbi->gc_mutex); f2fs_gc(sbi, 1); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 0948405af6f5..66a288a52fd3 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -459,7 +459,20 @@ static inline int get_ssr_segment(struct f2fs_sb_info *sbi, int type) static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi) { - return free_sections(sbi) <= reserved_sections(sbi); + unsigned int pages_per_sec = (1 << sbi->log_blocks_per_seg) * + sbi->segs_per_sec; + int node_secs = ((get_pages(sbi, F2FS_DIRTY_NODES) + pages_per_sec - 1) + >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; + int dent_secs = ((get_pages(sbi, F2FS_DIRTY_DENTS) + pages_per_sec - 1) + >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; + + if (sbi->por_doing) + return false; + + if (free_sections(sbi) <= (node_secs + 2 * dent_secs + + reserved_sections(sbi))) + return true; + return false; } static inline int utilization(struct f2fs_sb_info *sbi) -- cgit v1.2.1 From 06025f4df88e9e41f4ebcf6b4c3df30661332bc9 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 22 Dec 2012 12:09:43 +0900 Subject: f2fs: handle error from f2fs_iget_nowait In case f2fs_iget_nowait returns error, it results in truncate_hole being called with 'error' value as inode pointer. There is no check in truncate_hole for valid inode, so it could result in crash due "invalid access to memory". Avoid this by handling error condition properly. Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index b07e9b6ef376..207e2c865c7e 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -228,6 +228,9 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi, /* Deallocate previous index in the node page */ inode = f2fs_iget_nowait(sbi->sb, ino); + if (IS_ERR(inode)) + return; + truncate_hole(inode, bidx, bidx + 1); iput(inode); } -- cgit v1.2.1 From 344324f10fad05e40b1047c5e09ebbc77e43c24f Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 22 Dec 2012 12:09:58 +0900 Subject: f2fs: remove unneeded initialization of nr_dirty in dirty_seglist_info Since, the memory for the object of dirty_seglist_info is allocated using kzalloc - which returns zeroed out memory. So, there is no need to initialize the nr_dirty values with zeroes. Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 66f5e82ec324..de6240922b0a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1575,7 +1575,6 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) for (i = 0; i < NR_DIRTY_TYPE; i++) { dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL); - dirty_i->nr_dirty[i] = 0; if (!dirty_i->dirty_segmap[i]) return -ENOMEM; } -- cgit v1.2.1 From fd8bb65f796f041ee6ba400255ca9021bc45a992 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 22 Dec 2012 12:10:12 +0900 Subject: f2fs: fix fsync_inode list addition logic and avoid invalid access to memory In function find_fsync_dnodes() - the fsync inodes gets added to the list, but in one path suppose f2fs_iget results in error, in such case - error gets added to the fsync inode list. 
In the next call to recover_data()->get_fsync_inode(): entry = list_entry(this, struct fsync_inode_entry, list); if (entry->inode->i_ino == ino) This can result in an "invalid access to memory" when it encounters the error value as an entry in the fsync inode list. So, add the fsync inode entry to the list only when there are no errors, and free the object right there in case of a problem. Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 207e2c865c7e..b571fee677d5 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -144,14 +144,15 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) goto out; } - INIT_LIST_HEAD(&entry->list); - list_add_tail(&entry->list, head); - entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); if (IS_ERR(entry->inode)) { err = PTR_ERR(entry->inode); + kmem_cache_free(fsync_entry_slab, entry); goto out; } + + INIT_LIST_HEAD(&entry->list); + list_add_tail(&entry->list, head); entry->blkaddr = blkaddr; } if (IS_INODE(page)) { -- cgit v1.2.1 From 64c576fe51bc6b19e99340d2d0e1bda89f66db25 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 22 Dec 2012 12:10:27 +0900 Subject: f2fs: remove unneeded variable from f2fs_sync_fs We can directly return '0' from the function, instead of introducing a 'ret' variable. Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index cf0ffb800654..08a94c814bdc 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -119,7 +119,6 @@ static void f2fs_put_super(struct super_block *sb) int f2fs_sync_fs(struct super_block *sb, int sync) { struct f2fs_sb_info *sbi = F2FS_SB(sb); - int ret = 0; if (!sbi->s_dirty && !get_pages(sbi, F2FS_DIRTY_NODES)) return 0; @@ -127,7 +126,7 @@ int f2fs_sync_fs(struct super_block *sb, int sync) if (sync) write_checkpoint(sbi, false, false); - return ret; + return 0; } static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) -- cgit v1.2.1 From ce19a5d4321911f98d42e4d724630ae48f413719 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 26 Dec 2012 12:03:22 +0900 Subject: f2fs: clean up the start_bidx_of_node function This patch also resolves the following warning reported by kbuild test robot.
fs/f2fs/gc.c: In function 'start_bidx_of_node': fs/f2fs/gc.c:453:21: warning: 'bidx' may be used uninitialized in this function Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 644aa3808273..eda8230deb0c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -430,28 +430,22 @@ next_step: */ block_t start_bidx_of_node(unsigned int node_ofs) { - block_t start_bidx; - unsigned int bidx, indirect_blks; - int dec; + unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4; + unsigned int bidx; - indirect_blks = 2 * NIDS_PER_BLOCK + 4; + if (node_ofs == 0) + return 0; - start_bidx = 1; - if (node_ofs == 0) { - start_bidx = 0; - } else if (node_ofs <= 2) { + if (node_ofs <= 2) { bidx = node_ofs - 1; } else if (node_ofs <= indirect_blks) { - dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1); + int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1); bidx = node_ofs - 2 - dec; } else { - dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); + int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); bidx = node_ofs - 5 - dec; } - - if (start_bidx) - start_bidx = bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE; - return start_bidx; + return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE; } static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, -- cgit v1.2.1 From 2b50638decdb9a8585654a5acf1c8ce5962f1951 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 26 Dec 2012 14:39:50 +0900 Subject: f2fs: clean up unused variables and return values This patch cleans up a couple of pieces of unnecessary code related to unused variables and return values. Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 12 +++--------- fs/f2fs/hash.c | 4 +--- fs/f2fs/node.c | 6 +----- 3 files changed, 5 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index eda8230deb0c..b0ec721e984a 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -390,9 +390,7 @@ next_step: } err = check_valid_map(sbi, segno, off); - if (err == GC_ERROR) - return err; - else if (err == GC_NEXT) + if (err == GC_NEXT) continue; if (initial) { @@ -550,9 +548,7 @@ next_step: } err = check_valid_map(sbi, segno, off); - if (err == GC_ERROR) - goto stop; - else if (err == GC_NEXT) + if (err == GC_NEXT) continue; if (phase == 0) { @@ -562,9 +558,7 @@ next_step: /* Get an inode by ino with checking validity */ err = check_dnode(sbi, entry, &dni, start_addr + off, &nofs); - if (err == GC_ERROR) - goto stop; - else if (err == GC_NEXT) + if (err == GC_NEXT) continue; if (phase == 1) { diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index 5e48baca3597..6977415c52fc 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -71,7 +71,7 @@ static void str2hashbuf(const char *msg, int len, unsigned int *buf, int num) f2fs_hash_t f2fs_dentry_hash(const char *name, int len) { - __u32 hash, minor_hash; + __u32 hash; f2fs_hash_t f2fs_hash; const char *p; __u32 in[8], buf[4]; @@ -94,8 +94,6 @@ f2fs_hash_t f2fs_dentry_hash(const char *name, int len) p += 16; } hash = buf[0]; - minor_hash = buf[1]; - f2fs_hash = cpu_to_le32(hash & ~F2FS_HASH_COL_BIT); return f2fs_hash; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index e85643cc74a9..5066bfd256c9 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1088,7 +1088,6 @@ static int
f2fs_write_node_page(struct page *page, { struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); nid_t nid; - unsigned int nofs; block_t new_addr; struct node_info ni; @@ -1105,7 +1104,6 @@ static int f2fs_write_node_page(struct page *page, /* get old block addr of this node page */ nid = nid_of_node(page); - nofs = ofs_of_node(page); BUG_ON(page->index != nid); get_node_info(sbi, nid, &ni); @@ -1566,7 +1564,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) nid_t nid; struct f2fs_nat_entry raw_ne; int offset = -1; - block_t old_blkaddr, new_blkaddr; + block_t new_blkaddr; ne = list_entry(cur, struct nat_entry, list); nid = nat_get_nid(ne); @@ -1580,7 +1578,6 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) offset = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 1); if (offset >= 0) { raw_ne = nat_in_journal(sum, offset); - old_blkaddr = le32_to_cpu(raw_ne.block_addr); goto flush_now; } to_nat_page: @@ -1602,7 +1599,6 @@ to_nat_page: BUG_ON(!nat_blk); raw_ne = nat_blk->entries[nid - start_nid]; - old_blkaddr = le32_to_cpu(raw_ne.block_addr); flush_now: new_blkaddr = nat_get_blkaddr(ne); -- cgit v1.2.1 From 9836b8b9499cb25ea32cad9fff640eef874c5431 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 27 Dec 2012 19:55:46 +0200 Subject: f2fs: unify string length declarations and usage This patch is intended to unify string length declarations and usage. There are a number of calls to strlen, which returns a size_t object. The size of this object depends on the compiler; it may be bigger than, equal to, or even smaller than an unsigned int. Signed-off-by: Leon Romanovsky Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 10 +++++----- fs/f2fs/f2fs.h | 2 +- fs/f2fs/hash.c | 10 ++++++---- fs/f2fs/namei.c | 6 +++--- fs/f2fs/xattr.c | 5 +++-- 5 files changed, 18 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 4a78d6c4f3a7..951ed52748f6 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -75,7 +75,7 @@ static unsigned long dir_block_index(unsigned int level, unsigned int idx) return bidx; } -static bool early_match_name(const char *name, int namelen, +static bool early_match_name(const char *name, size_t namelen, f2fs_hash_t namehash, struct f2fs_dir_entry *de) { if (le16_to_cpu(de->name_len) != namelen) @@ -88,7 +88,7 @@ static bool early_match_name(const char *name, int namelen, } static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, - const char *name, int namelen, int *max_slots, + const char *name, size_t namelen, int *max_slots, f2fs_hash_t namehash, struct page **res_page) { struct f2fs_dir_entry *de; @@ -127,7 +127,7 @@ found: } static struct f2fs_dir_entry *find_in_level(struct inode *dir, - unsigned int level, const char *name, int namelen, + unsigned int level, const char *name, size_t namelen, f2fs_hash_t namehash, struct page **res_page) { int s = GET_DENTRY_SLOTS(namelen); @@ -182,7 +182,7 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, struct qstr *child, struct page **res_page) { const char *name = child->name; - int namelen = child->len; + size_t namelen = child->len; unsigned long npages = dir_blocks(dir); struct f2fs_dir_entry *de = NULL; f2fs_hash_t name_hash; @@ -383,7 +383,7 @@ int f2fs_add_link(struct dentry *dentry, struct inode *inode) struct inode *dir = dentry->d_parent->d_inode; struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; + size_t namelen = dentry->d_name.len; struct page *dentry_page = NULL; struct f2fs_dentry_block *dentry_blk = NULL; int slots = GET_DENTRY_SLOTS(namelen); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a18d63db2fb6..13c6dfbb7183 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -881,7 +881,7 @@ int
f2fs_sync_fs(struct super_block *, int); /* * hash.c */ -f2fs_hash_t f2fs_dentry_hash(const char *, int); +f2fs_hash_t f2fs_dentry_hash(const char *, size_t); /* * node.c diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index 6977415c52fc..6eb8d269b53b 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -42,7 +42,7 @@ static void TEA_transform(unsigned int buf[4], unsigned int const in[]) buf[1] += b1; } -static void str2hashbuf(const char *msg, int len, unsigned int *buf, int num) +static void str2hashbuf(const char *msg, size_t len, unsigned int *buf, int num) { unsigned pad, val; int i; @@ -69,7 +69,7 @@ static void str2hashbuf(const char *msg, int len, unsigned int *buf, int num) *buf++ = pad; } -f2fs_hash_t f2fs_dentry_hash(const char *name, int len) +f2fs_hash_t f2fs_dentry_hash(const char *name, size_t len) { __u32 hash; f2fs_hash_t f2fs_hash; @@ -87,11 +87,13 @@ f2fs_hash_t f2fs_dentry_hash(const char *name, int len) buf[3] = 0x10325476; p = name; - while (len > 0) { + while (1) { str2hashbuf(p, len, in, 4); TEA_transform(buf, in); - len -= 16; p += 16; + if (len <= 16) + break; + len -= 16; } hash = buf[0]; f2fs_hash = cpu_to_le32(hash & ~F2FS_HASH_COL_BIT); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index b42389f80011..1a49b881bac0 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -77,8 +77,8 @@ fail: static int is_multimedia_file(const unsigned char *s, const char *sub) { - int slen = strlen(s); - int sublen = strlen(sub); + size_t slen = strlen(s); + size_t sublen = strlen(sub); int ret; if (sublen > slen) @@ -250,7 +250,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, struct super_block *sb = dir->i_sb; struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; - unsigned symlen = strlen(symname) + 1; + size_t symlen = strlen(symname) + 1; int err; f2fs_balance_fs(sbi); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 7d52e8dc0c59..940136a3d3a6 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -208,7 +208,7 @@ int f2fs_getxattr(struct inode *inode, int name_index, const char *name, struct page *page; void *base_addr; int error = 0, found = 0; - int value_len, name_len; + size_t value_len, name_len; if (name == NULL) return -EINVAL; @@ -304,7 +304,8 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, struct f2fs_xattr_entry *here, *last; struct page *page; void *base_addr; - int error, found, free, name_len, newsize; + int error, found, free, newsize; + size_t name_len; char *pval; if (name == NULL) -- cgit v1.2.1 From ea702b80e0bbb2448e201472127288beb82ca2fe Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 27 Dec 2012 07:28:55 -0500 Subject: cifs: move check for NULL socket into smb_send_rqst Cai reported this oops: [90701.616664] BUG: unable to handle kernel NULL pointer dereference at 0000000000000028 [90701.625438] IP: [] kernel_setsockopt+0x2e/0x60 [90701.632167] PGD fea319067 PUD 103fda4067 PMD 0 [90701.637255] Oops: 0000 [#1] SMP [90701.640878] Modules linked in: des_generic md4 nls_utf8 cifs dns_resolver binfmt_misc tun sg igb iTCO_wdt iTCO_vendor_support lpc_ich pcspkr i2c_i801 i2c_core i7core_edac edac_core ioatdma dca mfd_core coretemp kvm_intel kvm crc32c_intel microcode sr_mod cdrom ata_generic sd_mod pata_acpi crc_t10dif ata_piix libata megaraid_sas dm_mirror dm_region_hash dm_log dm_mod [90701.677655] CPU 10 [90701.679808] Pid: 9627, comm: ls Tainted: G W 3.7.1+ #10 QCI QSSC-S4R/QSSC-S4R [90701.688950] RIP: 0010:[] [] kernel_setsockopt+0x2e/0x60 [90701.698383] RSP: 0018:ffff88177b431bb8 EFLAGS: 
00010206 [90701.704309] RAX: ffff88177b431fd8 RBX: 00007ffffffff000 RCX: ffff88177b431bec [90701.712271] RDX: 0000000000000003 RSI: 0000000000000006 RDI: 0000000000000000 [90701.720223] RBP: ffff88177b431bc8 R08: 0000000000000004 R09: 0000000000000000 [90701.728185] R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000001 [90701.736147] R13: ffff88184ef92000 R14: 0000000000000023 R15: ffff88177b431c88 [90701.744109] FS: 00007fd56a1a47c0(0000) GS:ffff88105fc40000(0000) knlGS:0000000000000000 [90701.753137] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [90701.759550] CR2: 0000000000000028 CR3: 000000104f15f000 CR4: 00000000000007e0 [90701.767512] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [90701.775465] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [90701.783428] Process ls (pid: 9627, threadinfo ffff88177b430000, task ffff88185ca4cb60) [90701.792261] Stack: [90701.794505] 0000000000000023 ffff88177b431c50 ffff88177b431c38 ffffffffa014fcb1 [90701.802809] ffff88184ef921bc 0000000000000000 00000001ffffffff ffff88184ef921c0 [90701.811123] ffff88177b431c08 ffffffff815ca3d9 ffff88177b431c18 ffff880857758000 [90701.819433] Call Trace: [90701.822183] [] smb_send_rqst+0x71/0x1f0 [cifs] [90701.828991] [] ? schedule+0x29/0x70 [90701.834736] [] smb_sendv+0x3d/0x40 [cifs] [90701.841062] [] smb_send+0x26/0x30 [cifs] [90701.847291] [] send_nt_cancel+0x6f/0xd0 [cifs] [90701.854102] [] SendReceive+0x18e/0x360 [cifs] [90701.860814] [] CIFSFindFirst+0x1a8/0x3f0 [cifs] [90701.867724] [] ? build_path_from_dentry+0xf1/0x260 [cifs] [90701.875601] [] ? build_path_from_dentry+0xf1/0x260 [cifs] [90701.883477] [] cifs_query_dir_first+0x26/0x30 [cifs] [90701.890869] [] initiate_cifs_search+0xed/0x250 [cifs] [90701.898354] [] ? fillonedir+0x100/0x100 [90701.904486] [] cifs_readdir+0x45b/0x8f0 [cifs] [90701.911288] [] ? fillonedir+0x100/0x100 [90701.917410] [] ? fillonedir+0x100/0x100 [90701.923533] [] ? fillonedir+0x100/0x100 [90701.929657] [] vfs_readdir+0xb8/0xe0 [90701.935490] [] sys_getdents+0x8f/0x110 [90701.941521] [] system_call_fastpath+0x16/0x1b [90701.948222] Code: 66 90 55 65 48 8b 04 25 f0 c6 00 00 48 89 e5 53 48 83 ec 08 83 fe 01 48 8b 98 48 e0 ff ff 48 c7 80 48 e0 ff ff ff ff ff ff 74 22 <48> 8b 47 28 ff 50 68 65 48 8b 14 25 f0 c6 00 00 48 89 9a 48 e0 [90701.970313] RIP [] kernel_setsockopt+0x2e/0x60 [90701.977125] RSP [90701.981018] CR2: 0000000000000028 [90701.984809] ---[ end trace 24bd602971110a43 ]--- This is likely due to a race vs. a reconnection event. The current code checks for a NULL socket in smb_send_kvec, but that's too late. By the time that check is done, the socket will already have been passed to kernel_setsockopt. Move the check into smb_send_rqst, so that it's checked earlier. In truth, this is a bit of a half-assed fix. The -ENOTSOCK error return here looks like it could bubble back up to userspace. The locking rules around the ssocket pointer are really unclear as well. There are cases where the ssocket pointer is changed without holding the srv_mutex, but I'm not clear whether there's a potential race here yet or not. This code seems like it could benefit from some fundamental re-think of how the socket handling should behave. Until then though, this patch should at least fix the above oops in most cases. 
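The shape of this fix (hoist the pointer check to the entry point, before anything can dereference the pointer) is worth showing in isolation. Below is a minimal standalone C sketch of the pattern; the names are invented and it is not the cifs code itself:

#include <stdio.h>
#include <errno.h>

struct conn { int *sock; };  /* toy stand-in for TCP_Server_Info::ssocket */

/* helper that unconditionally uses the socket; checking for NULL here
 * would be too late, since the caller has already touched it */
static int send_kvec(struct conn *c, const char *buf)
{
        printf("sending on fd %d: %s\n", *c->sock, buf);
        return 0;
}

/* entry point: reject a dead connection before any dereference */
static int send_rqst(struct conn *c, const char *buf)
{
        if (c->sock == NULL)
                return -ENOTSOCK;
        /* in the real code, kernel_setsockopt() was called around here,
         * which is exactly where the reported oops triggered */
        return send_kvec(c, buf);
}

int main(void)
{
        int fd = 3;
        struct conn live = { .sock = &fd }, dead = { .sock = NULL };

        printf("live: %d\n", send_rqst(&live, "hello"));
        printf("dead: %d\n", send_rqst(&dead, "hello"));
        return 0;
}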
Cc: # 3.7+ Reported-and-Tested-by: CAI Qian Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/transport.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 76d974c952fe..1a528680ec5a 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -144,9 +144,6 @@ smb_send_kvec(struct TCP_Server_Info *server, struct kvec *iov, size_t n_vec, *sent = 0; - if (ssocket == NULL) - return -ENOTSOCK; /* BB eventually add reconnect code here */ - smb_msg.msg_name = (struct sockaddr *) &server->dstaddr; smb_msg.msg_namelen = sizeof(struct sockaddr); smb_msg.msg_control = NULL; @@ -291,6 +288,9 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst) struct socket *ssocket = server->ssocket; int val = 1; + if (ssocket == NULL) + return -ENOTSOCK; + cFYI(1, "Sending smb: smb_len=%u", smb_buf_length); dump_smb(iov[0].iov_base, iov[0].iov_len); -- cgit v1.2.1 From 31efee60f489c759c341454d755a9fd13de8c03d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 27 Dec 2012 08:05:03 -0500 Subject: cifs: adjust sequence number downward after signing NT_CANCEL request When a call goes out, the signing code adjusts the sequence number upward by two to account for the request and the response. An NT_CANCEL however doesn't get a response of its own, it just hurries the server along to get it to respond to the original request more quickly. Therefore, we must adjust the sequence number back down by one after signing an NT_CANCEL request. Cc: Reported-by: Tim Perry Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/smb1ops.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index a5d234c8d5d9..dd79056c0581 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -53,6 +53,13 @@ send_nt_cancel(struct TCP_Server_Info *server, void *buf, mutex_unlock(&server->srv_mutex); return rc; } + + /* + * The response to this call was already factored into the sequence + * number when the call went out, so we must adjust it back downward + * after signing here. + */ + --server->sequence_number; rc = smb_send(server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); mutex_unlock(&server->srv_mutex); -- cgit v1.2.1 From ca8aa29c60238720af2ca2a5caab25fa0c70067e Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Fri, 21 Dec 2012 15:05:47 +0400 Subject: Revert "CIFS: Fix write after setting a read lock for read oplock files" That solution has data races and can end up issuing two identical writes to the server, since the clientCanCacheAll value can change during the execution of __generic_file_aio_write.
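For illustration only, here is a tiny standalone C11 sketch of the race class described above; the names are invented, and the real flags are plain bools protected by other means rather than C11 atomics:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* toy stand-in for cifsInodeInfo::clientCanCacheAll, which the oplock
 * break handler may flip on another CPU at any moment */
static atomic_bool can_cache_all = true;

static void do_write(void)
{
        /* racy shape: the flag is read twice, so the two tests can
         * disagree and the data may be written twice or not at all */
        if (atomic_load(&can_cache_all))
                printf("cached write\n");
        if (!atomic_load(&can_cache_all))
                printf("write-through\n");

        /* read-once shape: act on a single snapshot; the reverted patch
         * tried this, but even the snapshot can go stale mid-write,
         * which is why the whole approach was backed out */
        bool cached = atomic_load(&can_cache_all);
        printf(cached ? "cached write\n" : "write-through\n");
}

int main(void)
{
        do_write();
        return 0;
}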
Signed-off-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 1 - fs/cifs/cifsglob.h | 1 - fs/cifs/file.c | 94 ++++++++++++++++++------------------------------------ 3 files changed, 31 insertions(+), 65 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index f653835d067b..de7f9168a118 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -228,7 +228,6 @@ cifs_alloc_inode(struct super_block *sb) cifs_set_oplock_level(cifs_inode, 0); cifs_inode->delete_pending = false; cifs_inode->invalid_mapping = false; - cifs_inode->leave_pages_clean = false; cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ cifs_inode->server_eof = 0; cifs_inode->uniqueid = 0; diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index aea1eec64911..dfab450a191e 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1030,7 +1030,6 @@ struct cifsInodeInfo { bool clientCanCacheAll; /* read and writebehind oplock */ bool delete_pending; /* DELETE_ON_CLOSE is set */ bool invalid_mapping; /* pagecache is invalid */ - bool leave_pages_clean; /* protected by i_mutex, not set pages dirty */ unsigned long time; /* jiffies of last update of inode */ u64 server_eof; /* current file size on server -- protected by i_lock */ u64 uniqueid; /* server inode number */ diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 0a6677ba212b..1b322d041f1e 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2103,15 +2103,7 @@ static int cifs_write_end(struct file *file, struct address_space *mapping, } else { rc = copied; pos += copied; - /* - * When we use strict cache mode and cifs_strict_writev was run - * with level II oplock (indicated by leave_pages_clean field of - * CIFS_I(inode)), we can leave pages clean - cifs_strict_writev - * sent the data to the server itself. - */ - if (!CIFS_I(inode)->leave_pages_clean || - !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)) - set_page_dirty(page); + set_page_dirty(page); } if (rc > 0) { @@ -2462,8 +2454,8 @@ ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov, } static ssize_t -cifs_pagecache_writev(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, bool cache_ex) +cifs_writev(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; @@ -2485,12 +2477,8 @@ cifs_pagecache_writev(struct kiocb *iocb, const struct iovec *iov, server->vals->exclusive_lock_type, NULL, CIFS_WRITE_OP)) { mutex_lock(&inode->i_mutex); - if (!cache_ex) - cinode->leave_pages_clean = true; rc = __generic_file_aio_write(iocb, iov, nr_segs, - &iocb->ki_pos); - if (!cache_ex) - cinode->leave_pages_clean = false; + &iocb->ki_pos); mutex_unlock(&inode->i_mutex); } @@ -2517,62 +2505,42 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, struct cifsFileInfo *cfile = (struct cifsFileInfo *) iocb->ki_filp->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); - ssize_t written, written2; + +#ifdef CONFIG_CIFS_SMB2 /* - * We need to store clientCanCacheAll here to prevent race - * conditions - this value can be changed during an execution - * of generic_file_aio_write. For CIFS it can be changed from - * true to false only, but for SMB2 it can be changed both from - * true to false and vice versa. 
So, we can end up with a data - * stored in the cache, not marked dirty and not sent to the - * server if this value changes its state from false to true - * after cifs_write_end. + * If we have an oplock for read and want to write a data to the file + * we need to store it in the page cache and then push it to the server + * to be sure the next read will get a valid data. */ - bool cache_ex = cinode->clientCanCacheAll; - bool cache_read = cinode->clientCanCacheRead; - int rc; - loff_t saved_pos; + if (!cinode->clientCanCacheAll && cinode->clientCanCacheRead) { + ssize_t written; + int rc; + + written = generic_file_aio_write(iocb, iov, nr_segs, pos); + rc = filemap_fdatawrite(inode->i_mapping); + if (rc) + return (ssize_t)rc; - if (cache_ex) { - if (cap_unix(tcon->ses) && - ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) && - (CIFS_UNIX_FCNTL_CAP & le64_to_cpu( - tcon->fsUnixInfo.Capability))) - return generic_file_aio_write(iocb, iov, nr_segs, pos); - return cifs_pagecache_writev(iocb, iov, nr_segs, pos, cache_ex); + return written; } +#endif /* - * For files without exclusive oplock in strict cache mode we need to - * write the data to the server exactly from the pos to pos+len-1 rather - * than flush all affected pages because it may cause a error with - * mandatory locks on these pages but not on the region from pos to - * ppos+len-1. + * For non-oplocked files in strict cache mode we need to write the data + * to the server exactly from the pos to pos+len-1 rather than flush all + * affected pages because it may cause a error with mandatory locks on + * these pages but not on the region from pos to ppos+len-1. */ - written = cifs_user_writev(iocb, iov, nr_segs, pos); - if (!cache_read || written <= 0) - return written; - saved_pos = iocb->ki_pos; - iocb->ki_pos = pos; - /* we have a read oplock - need to store a data in the page cache */ + if (!cinode->clientCanCacheAll) + return cifs_user_writev(iocb, iov, nr_segs, pos); + if (cap_unix(tcon->ses) && - ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) && - (CIFS_UNIX_FCNTL_CAP & le64_to_cpu( - tcon->fsUnixInfo.Capability))) - written2 = generic_file_aio_write(iocb, iov, nr_segs, pos); - else - written2 = cifs_pagecache_writev(iocb, iov, nr_segs, pos, - cache_ex); - /* errors occured during writing - invalidate the page cache */ - if (written2 < 0) { - rc = cifs_invalidate_mapping(inode); - if (rc) - written = (ssize_t)rc; - else - iocb->ki_pos = saved_pos; - } - return written; + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && + ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) + return generic_file_aio_write(iocb, iov, nr_segs, pos); + + return cifs_writev(iocb, iov, nr_segs, pos); } static struct cifs_readdata * -- cgit v1.2.1 From 88cf75aaaf27a652b3e85960ac3060172dd3edac Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Fri, 21 Dec 2012 15:07:52 +0400 Subject: CIFS: Fix write after setting a read lock for read oplock files If we have a read oplock and set a read lock in it, we can't write to the locked area, so filemap_fdatawrite may fail and return no useful information to a userspace application, even if we request a write to a non-locked area. Fix this by writing directly to the server and then breaking the oplock level from level2 to None. Also remove the CONFIG_CIFS_SMB2 ifdefs because the fix is suitable for both the CIFS and SMB2 protocols.
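As a toy model of the resulting order of operations (write through to the server first, then demote the client's read oplock so later reads refetch), the following standalone C sketch uses invented names and mirrors only the control flow of the patch below:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct toy_inode { bool can_cache_read; bool pagecache_valid; };

/* pretend to send the bytes straight to the server */
static long write_through(const char *data)
{
        printf("server write: %s\n", data);
        return (long)strlen(data);
}

static long strict_write(struct toy_inode *ino, const char *data)
{
        long written = write_through(data);

        if (written > 0 && ino->can_cache_read) {
                /* the server may delay breaking the level2 oplock, so
                 * break it locally: drop cached pages, stop caching */
                ino->pagecache_valid = false;
                ino->can_cache_read = false;
        }
        return written;
}

int main(void)
{
        struct toy_inode ino = { true, true };

        strict_write(&ino, "hello");
        printf("can_cache_read=%d pagecache_valid=%d\n",
               ino.can_cache_read, ino.pagecache_valid);
        return 0;
}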
Signed-off-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/file.c | 48 ++++++++++++++++++++---------------------------- 1 file changed, 20 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 1b322d041f1e..22c37254b64e 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2505,42 +2505,34 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, struct cifsFileInfo *cfile = (struct cifsFileInfo *) iocb->ki_filp->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + ssize_t written; -#ifdef CONFIG_CIFS_SMB2 - /* - * If we have an oplock for read and want to write a data to the file - * we need to store it in the page cache and then push it to the server - * to be sure the next read will get a valid data. - */ - if (!cinode->clientCanCacheAll && cinode->clientCanCacheRead) { - ssize_t written; - int rc; - - written = generic_file_aio_write(iocb, iov, nr_segs, pos); - rc = filemap_fdatawrite(inode->i_mapping); - if (rc) - return (ssize_t)rc; - - return written; + if (cinode->clientCanCacheAll) { + if (cap_unix(tcon->ses) && + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) + && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) + return generic_file_aio_write(iocb, iov, nr_segs, pos); + return cifs_writev(iocb, iov, nr_segs, pos); } -#endif - /* * For non-oplocked files in strict cache mode we need to write the data * to the server exactly from the pos to pos+len-1 rather than flush all * affected pages because it may cause a error with mandatory locks on * these pages but not on the region from pos to ppos+len-1. */ - - if (!cinode->clientCanCacheAll) - return cifs_user_writev(iocb, iov, nr_segs, pos); - - if (cap_unix(tcon->ses) && - (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && - ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) - return generic_file_aio_write(iocb, iov, nr_segs, pos); - - return cifs_writev(iocb, iov, nr_segs, pos); + written = cifs_user_writev(iocb, iov, nr_segs, pos); + if (written > 0 && cinode->clientCanCacheRead) { + /* + * Windows 7 server can delay breaking level2 oplock if a write + * request comes - break it on the client to prevent reading + * an old data. + */ + cifs_invalidate_mapping(inode); + cFYI(1, "Set no oplock for inode=%p after a write operation", + inode); + cinode->clientCanCacheRead = false; + } + return written; } static struct cifs_readdata * -- cgit v1.2.1 From 63b7d3a41ccadef971a4ffbe6662119d4275ebf9 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Mon, 24 Dec 2012 14:41:19 +0400 Subject: CIFS: Don't let read only caching for mandatory byte-range locked files If we have mandatory byte-range locks on a file we can't cache reads because pagereading may have conflicts with these locks on the server. That's why we should allow level2 oplocks for files without mandatory locks only. 
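The helper this patch adds, cifs_has_mand_locks() (visible in the diff below), walks every open file's lock list under a shared semaphore. A minimal userspace analogue, with pthreads and invented types, could look like this:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fid_locks { int nr_locks; struct fid_locks *next; };

struct toy_inode {
        pthread_rwlock_t lock_sem;   /* analogue of cinode->lock_sem */
        struct fid_locks *llist;     /* one lock list per open file */
};

/* true if any open file instance holds at least one byte-range lock */
static bool has_mand_locks(struct toy_inode *ino)
{
        bool found = false;

        pthread_rwlock_rdlock(&ino->lock_sem);  /* read side is enough */
        for (struct fid_locks *cur = ino->llist; cur; cur = cur->next) {
                if (cur->nr_locks) {
                        found = true;
                        break;
                }
        }
        pthread_rwlock_unlock(&ino->lock_sem);
        return found;
}

int main(void)
{
        struct fid_locks f2 = { 3, NULL };
        struct fid_locks f1 = { 0, &f2 };
        struct toy_inode ino = { .llist = &f1 };

        pthread_rwlock_init(&ino.lock_sem, NULL);
        printf("has mandatory locks: %d\n", has_mand_locks(&ino));
        return 0;
}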
Signed-off-by: Pavel Shilovsky Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 1 + fs/cifs/file.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- fs/cifs/smb1ops.c | 1 + fs/cifs/smb2ops.c | 2 ++ 4 files changed, 57 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index dfab450a191e..e6899cea1c35 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -386,6 +386,7 @@ struct smb_version_values { unsigned int cap_unix; unsigned int cap_nt_find; unsigned int cap_large_files; + unsigned int oplock_read; }; #define HEADER_SIZE(server) (server->vals->header_size) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 22c37254b64e..8ea6ca50a665 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -238,6 +238,23 @@ out: return rc; } +static bool +cifs_has_mand_locks(struct cifsInodeInfo *cinode) +{ + struct cifs_fid_locks *cur; + bool has_locks = false; + + down_read(&cinode->lock_sem); + list_for_each_entry(cur, &cinode->llist, llist) { + if (!list_empty(&cur->locks)) { + has_locks = true; + break; + } + } + up_read(&cinode->lock_sem); + return has_locks; +} + struct cifsFileInfo * cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, struct tcon_link *tlink, __u32 oplock) @@ -248,6 +265,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, struct cifsFileInfo *cfile; struct cifs_fid_locks *fdlocks; struct cifs_tcon *tcon = tlink_tcon(tlink); + struct TCP_Server_Info *server = tcon->ses->server; cfile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); if (cfile == NULL) @@ -276,12 +294,22 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, INIT_WORK(&cfile->oplock_break, cifs_oplock_break); mutex_init(&cfile->fh_mutex); + /* + * If the server returned a read oplock and we have mandatory brlocks, + * set oplock level to None. + */ + if (oplock == server->vals->oplock_read && + cifs_has_mand_locks(cinode)) { + cFYI(1, "Reset oplock val from read to None due to mand locks"); + oplock = 0; + } + spin_lock(&cifs_file_list_lock); - if (fid->pending_open->oplock != CIFS_OPLOCK_NO_CHANGE) + if (fid->pending_open->oplock != CIFS_OPLOCK_NO_CHANGE && oplock) oplock = fid->pending_open->oplock; list_del(&fid->pending_open->olist); - tlink_tcon(tlink)->ses->server->ops->set_fid(cfile, fid, oplock); + server->ops->set_fid(cfile, fid, oplock); list_add(&cfile->tlist, &tcon->openFileList); /* if readable file instance put first in list*/ @@ -1422,6 +1450,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; + struct inode *inode = cfile->dentry->d_inode; if (posix_lck) { int posix_lock_type; @@ -1459,6 +1488,21 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, if (!rc) goto out; + /* + * Windows 7 server can delay breaking lease from read to None + * if we set a byte-range lock on a file - break it explicitly + * before sending the lock to the server to be sure the next + * read won't conflict with non-overlapted locks due to + * pagereading. 
+ */ + if (!CIFS_I(inode)->clientCanCacheAll && + CIFS_I(inode)->clientCanCacheRead) { + cifs_invalidate_mapping(inode); + cFYI(1, "Set no oplock for inode=%p due to mand locks", + inode); + CIFS_I(inode)->clientCanCacheRead = false; + } + rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, type, 1, 0, wait_flag); if (rc) { @@ -3537,6 +3581,13 @@ void cifs_oplock_break(struct work_struct *work) struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); int rc = 0; + if (!cinode->clientCanCacheAll && cinode->clientCanCacheRead && + cifs_has_mand_locks(cinode)) { + cFYI(1, "Reset oplock to None for inode=%p due to mand locks", + inode); + cinode->clientCanCacheRead = false; + } + if (inode && S_ISREG(inode->i_mode)) { if (cinode->clientCanCacheRead) break_lease(inode, O_RDONLY); diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index dd79056c0581..47bc5a87f94e 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -959,4 +959,5 @@ struct smb_version_values smb1_values = { .cap_unix = CAP_UNIX, .cap_nt_find = CAP_NT_SMBS | CAP_NT_FIND, .cap_large_files = CAP_LARGE_FILES, + .oplock_read = OPLOCK_READ, }; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index d79de7bc4435..c9c7aa7ed966 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -708,6 +708,7 @@ struct smb_version_values smb20_values = { .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, .cap_large_files = SMB2_LARGE_FILES, + .oplock_read = SMB2_OPLOCK_LEVEL_II, }; struct smb_version_values smb21_values = { @@ -725,6 +726,7 @@ struct smb_version_values smb21_values = { .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, .cap_large_files = SMB2_LARGE_FILES, + .oplock_read = SMB2_OPLOCK_LEVEL_II, }; struct smb_version_values smb30_values = { -- cgit v1.2.1 From ec1487528bed94c4aaff3687834fe94203880fd6 Mon Sep 17 00:00:00 2001 From: Nathan Straz Date: Tue, 11 Dec 2012 17:01:24 -0500 Subject: GFS2: Initialize hex string to '0' When generating the DLM lock name, a value of 0 would skip the loop and leave the string unchanged. This left locks with a value of 0 unlabeled. Initializing the string to '0' fixes this. Signed-off-by: Nathan Straz Signed-off-by: Steven Whitehouse --- fs/gfs2/lock_dlm.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 8dad6b093716..b906ed17a839 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -241,6 +241,7 @@ static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, static void gfs2_reverse_hex(char *c, u64 value) { + *c = '0'; while (value) { *c-- = hex_asc[value & 0x0f]; value >>= 4; -- cgit v1.2.1 From f1213cacc7ffc7d4cdef3692f22b28a2df3216f5 Mon Sep 17 00:00:00 2001 From: Abhijith Das Date: Wed, 19 Dec 2012 10:48:01 -0500 Subject: GFS2: Fix race in gfs2_rs_alloc QE aio tests uncovered a race condition in gfs2_rs_alloc where it's possible to come out of the function with a valid ip->i_res allocation, but it gets freed before use, resulting in a NULL ptr dereference. This patch envelops the initial short-circuit check for a non-NULL ip->i_res within the mutex lock. With this patch, I was able to successfully run the reproducer test multiple times.
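The general shape of this fix (move the short-circuit check under the same lock that protects the allocation) is the classic check/lock/recheck pattern. Below is a hedged standalone C sketch with invented names, not the GFS2 code itself:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
static int *resv;   /* analogue of ip->i_res */

/* racy shape (pre-patch): testing 'if (resv) return 0;' before taking
 * the mutex lets another thread free resv between the check and use */

/* fixed shape: take the lock first, then test and allocate */
static int rs_alloc(void)
{
        int err = 0;

        pthread_mutex_lock(&mtx);
        if (resv)
                goto out;   /* already allocated by someone else */
        resv = calloc(1, sizeof(*resv));
        if (!resv)
                err = -1;   /* stands in for -ENOMEM */
out:
        pthread_mutex_unlock(&mtx);
        return err;
}

int main(void)
{
        printf("first:  %d\n", rs_alloc());
        printf("second: %d\n", rs_alloc());  /* short-circuits under the lock */
        return 0;
}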
Resolves: rhbz#878476 Signed-off-by: Abhi Das Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 37ee061d899e..738b3888adc6 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -557,22 +557,20 @@ void gfs2_free_clones(struct gfs2_rgrpd *rgd) */ int gfs2_rs_alloc(struct gfs2_inode *ip) { - struct gfs2_blkreserv *res; + int error = 0; + down_write(&ip->i_rw_mutex); if (ip->i_res) - return 0; - - res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS); - if (!res) - return -ENOMEM; + goto out; - RB_CLEAR_NODE(&res->rs_node); + ip->i_res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS); + if (!ip->i_res) { + error = -ENOMEM; + goto out; + } - down_write(&ip->i_rw_mutex); - if (ip->i_res) - kmem_cache_free(gfs2_rsrv_cachep, res); - else - ip->i_res = res; + RB_CLEAR_NODE(&ip->i_res->rs_node); +out: up_write(&ip->i_rw_mutex); return 0; } -- cgit v1.2.1 From 15bd50ad82a6d3421af1abe82e2554898abc4141 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Thu, 20 Dec 2012 13:21:07 -0500 Subject: GFS2: Stop looking for free blocks at end of rgrp This patch adds a return code check after calling function gfs2_rbm_from_block while determining the free extent size. That way, when the end of an rgrp is reached, it won't try to process unaligned blocks after the end. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 738b3888adc6..712dd4fd8641 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -350,10 +350,14 @@ static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len) BUG_ON(len < chunk_size); len -= chunk_size; block = gfs2_rbm_to_block(&rbm); - gfs2_rbm_from_block(&rbm, block + chunk_size); - n_unaligned = 3; - if (ptr) + if (gfs2_rbm_from_block(&rbm, block + chunk_size)) { + n_unaligned = 0; break; + } + if (ptr) { + n_unaligned = 3; + break; + } n_unaligned = len & 3; } -- cgit v1.2.1 From 13d2eb012927b03ac1b80202af5aa9abc4003bd5 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Thu, 20 Dec 2012 13:23:04 -0500 Subject: GFS2: Reset rd_last_alloc when it reaches the end of the rgrp In function rg_mblk_search, it's searching for multiple blocks in a given state (e.g. "free"). If there's an active block reservation its goal is the next free block of that. If the resource group contains the dinode's goal block, that's used for the search. But if neither is the case, it uses the rgrp's last allocated block. That way, consecutive allocations appear after one another on media. The problem comes in when you hit the end of the rgrp; it would never start over and search from the beginning. This became a problem, since if you deleted all the files and data from the rgrp, it would never start over and find free blocks. So it had to keep searching further out on the media to allocate blocks. This patch resets the rd_last_alloc after it does an unsuccessful search at the end of the rgrp. 
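The effect can be shown with a toy allocator in standalone C; the names are invented and real rgrps track bitmaps rather than a flat array:

#include <stdbool.h>
#include <stdio.h>

#define NBLOCKS 8
static bool used[NBLOCKS] = { true, true, false, true, true, true, true, true };
static int last_alloc;   /* search hint, analogue of rd_last_alloc */

/* search from the hint to the end; on failure reset the hint so the
 * next attempt starts from the beginning of the group again */
static int alloc_block(void)
{
        for (int b = last_alloc; b < NBLOCKS; b++) {
                if (!used[b]) {
                        used[b] = true;
                        last_alloc = b;
                        return b;
                }
        }
        last_alloc = 0;   /* the reset this patch adds, in spirit */
        return -1;
}

int main(void)
{
        last_alloc = 5;
        printf("first try:  %d\n", alloc_block());  /* -1: blocks 5..7 used */
        printf("second try: %d\n", alloc_block());  /* 2: found after reset */
        return 0;
}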
Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 712dd4fd8641..b7eff078fe90 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1426,6 +1426,9 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, rs->rs_free = extlen; rs->rs_inum = ip->i_no_addr; rs_insert(ip); + } else { + if (goal == rgd->rd_last_alloc + rgd->rd_data0) + rgd->rd_last_alloc = 0; } } -- cgit v1.2.1 From 128dd1759d96ad36c379240f8b9463e8acfd37a1 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 1 Jan 2013 21:20:27 +0000 Subject: epoll: prevent missed events on EPOLL_CTL_MOD EPOLL_CTL_MOD sets the interest mask before calling f_op->poll() to ensure events are not missed. Since the modifications to the interest mask are not protected by the same lock as ep_poll_callback, we need to ensure the change is visible to other CPUs calling ep_poll_callback. We also need to ensure f_op->poll() has an up-to-date view of past events which occurred before we modified the interest mask. So this barrier also pairs with the barrier in wq_has_sleeper(). This should guarantee either ep_poll_callback or f_op->poll() (or both) will notice the readiness of a recently-ready/modified item. This issue was encountered by Andreas Voellmy and Junchang(Jason) Wang in: http://thread.gmane.org/gmane.linux.kernel/1408782/ Signed-off-by: Eric Wong Cc: Hans Verkuil Cc: Jiri Olsa Cc: Jonathan Corbet Cc: Al Viro Cc: Davide Libenzi Cc: Hans de Goede Cc: Mauro Carvalho Chehab Cc: David Miller Cc: Eric Dumazet Cc: Andrew Morton Cc: Andreas Voellmy Tested-by: "Junchang(Jason) Wang" Cc: netdev@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index be56b21435f8..9fec1836057a 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1313,7 +1313,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even * otherwise we might miss an event that happens between the * f_op->poll() call and the new event set registering. */ - epi->event.events = event->events; + epi->event.events = event->events; /* need barrier below */ pt._key = event->events; epi->event.data = event->data; /* protected by mtx */ if (epi->event.events & EPOLLWAKEUP) { @@ -1323,6 +1323,26 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even ep_destroy_wakeup_source(epi); } + /* + * The following barrier has two effects: + * + * 1) Flush epi changes above to other CPUs. This ensures + * we do not miss events from ep_poll_callback if an + * event occurs immediately after we call f_op->poll(). + * We need this because we did not take ep->lock while + * changing epi above (but ep_poll_callback does take + * ep->lock). + * + * 2) We also need to ensure we do not miss _past_ events + * when calling f_op->poll(). This barrier also + * pairs with the barrier in wq_has_sleeper (see + * comments for wq_has_sleeper). + * + * This barrier will now guarantee ep_poll_callback or f_op->poll + * (or both) will notice the readiness of an item. + */ + smp_mb(); + /* * Get current event bits. We can safely use the file* here because * its usage count has been increased by the caller of this function.
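The store/fence/load pairing this patch relies on is the classic store-buffering idiom. A standalone C11 sketch with invented names follows; the kernel uses smp_mb() where this uses seq_cst fences:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int interest;  /* analogue of epi->event.events */
static atomic_int ready;     /* analogue of a pending event */

/* modifier side (ep_modify): publish the new mask, then poll */
static void modify_and_poll(int new_mask)
{
        atomic_store_explicit(&interest, new_mask, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);  /* the smp_mb() */
        if (atomic_load_explicit(&ready, memory_order_relaxed) & new_mask)
                printf("modifier sees the event\n");
}

/* event side (ep_poll_callback): publish the event, then check mask */
static void signal_event(int ev)
{
        atomic_store_explicit(&ready, ev, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);
        if (atomic_load_explicit(&interest, memory_order_relaxed) & ev)
                printf("callback sees the interest\n");
}

int main(void)
{
        /* run each side once; with both fences in place, at least one
         * side must observe the other under any interleaving, so a
         * just-ready, just-modified item cannot be missed by both */
        signal_event(1);
        modify_and_poll(1);
        return 0;
}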
-- cgit v1.2.1 From a7a88b23737095e6c18a20c5d4eef9e25ec5b829 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 2 Jan 2013 02:04:23 -0800 Subject: mempolicy: remove arg from mpol_parse_str, mpol_to_str Remove the unused argument (formerly no_context) from mpol_parse_str() and from mpol_to_str(). Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 448455b7fd91..ca5ce7f9f800 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1278,7 +1278,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) walk.mm = mm; pol = get_vma_policy(task, vma, vma->vm_start); - mpol_to_str(buffer, sizeof(buffer), pol, 0); + mpol_to_str(buffer, sizeof(buffer), pol); mpol_cond_put(pol); seq_printf(m, "%08lx %s", vma->vm_start, buffer); -- cgit v1.2.1 From f8d9a897d4384b77f13781ea813156568f68b83e Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Thu, 3 Jan 2013 16:42:29 -0500 Subject: NFS: Fix access to suid/sgid executables nfs_open_permission_mask() should only check MAY_EXEC for files that are opened with __FMODE_EXEC. Also fix NFSv4 access-in-open path in a similar way -- openflags must be used because fmode will not always have FMODE_EXEC set. This patch fixes https://bugzilla.kernel.org/show_bug.cgi?id=49101 Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/nfs/dir.c | 16 ++++++++++------ fs/nfs/nfs4proc.c | 18 +++++++++++------- 2 files changed, 21 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 32e6c53520e2..1b2d7eb93796 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2153,12 +2153,16 @@ static int nfs_open_permission_mask(int openflags) { int mask = 0; - if ((openflags & O_ACCMODE) != O_WRONLY) - mask |= MAY_READ; - if ((openflags & O_ACCMODE) != O_RDONLY) - mask |= MAY_WRITE; - if (openflags & __FMODE_EXEC) - mask |= MAY_EXEC; + if (openflags & __FMODE_EXEC) { + /* ONLY check exec rights */ + mask = MAY_EXEC; + } else { + if ((openflags & O_ACCMODE) != O_WRONLY) + mask |= MAY_READ; + if ((openflags & O_ACCMODE) != O_RDONLY) + mask |= MAY_WRITE; + } + return mask; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5d864fb36578..cf747ef86650 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1626,7 +1626,8 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data) static int nfs4_opendata_access(struct rpc_cred *cred, struct nfs4_opendata *opendata, - struct nfs4_state *state, fmode_t fmode) + struct nfs4_state *state, fmode_t fmode, + int openflags) { struct nfs_access_entry cache; u32 mask; @@ -1638,11 +1639,14 @@ static int nfs4_opendata_access(struct rpc_cred *cred, mask = 0; /* don't check MAY_WRITE - a newly created file may not have - * write mode bits, but POSIX allows the creating process to write */ - if (fmode & FMODE_READ) - mask |= MAY_READ; - if (fmode & FMODE_EXEC) - mask |= MAY_EXEC; + * write mode bits, but POSIX allows the creating process to write. + * use openflags to check for exec, because fmode won't + * always have FMODE_EXEC set when file open for exec. 
*/ + if (openflags & __FMODE_EXEC) { + /* ONLY check for exec rights */ + mask = MAY_EXEC; + } else if (fmode & FMODE_READ) + mask = MAY_READ; cache.cred = cred; cache.jiffies = jiffies; @@ -1896,7 +1900,7 @@ static int _nfs4_do_open(struct inode *dir, if (server->caps & NFS_CAP_POSIX_LOCK) set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); - status = nfs4_opendata_access(cred, opendata, state, fmode); + status = nfs4_opendata_access(cred, opendata, state, fmode, flags); if (status != 0) goto err_opendata_put; -- cgit v1.2.1 From f568f6ca811fe681ecfd11c4ce78b6aa488020c0 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 21 Dec 2012 15:02:05 -0800 Subject: pstore: remove __dev* attributes. CONFIG_HOTPLUG is going away as an option. As a result, the __dev* markings need to be removed. This change removes the use of __devinit from the pstore filesystem. Based on patches originally written by Bill Pemberton, but redone by me in order to handle some of the coding style issues better, by hand. Cc: Bill Pemberton Cc: Anton Vorontsov Cc: Colin Cross Cc: Kees Cook Cc: Tony Luck Signed-off-by: Greg Kroah-Hartman --- fs/pstore/ram.c | 14 ++++++-------- fs/pstore/ram_core.c | 9 ++++----- 2 files changed, 10 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index f883e7e74305..7003e5266f25 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -291,9 +291,8 @@ static void ramoops_free_przs(struct ramoops_context *cxt) kfree(cxt->przs); } -static int __devinit ramoops_init_przs(struct device *dev, - struct ramoops_context *cxt, - phys_addr_t *paddr, size_t dump_mem_sz) +static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, + phys_addr_t *paddr, size_t dump_mem_sz) { int err = -ENOMEM; int i; @@ -336,10 +335,9 @@ fail_prz: return err; } -static int __devinit ramoops_init_prz(struct device *dev, - struct ramoops_context *cxt, - struct persistent_ram_zone **prz, - phys_addr_t *paddr, size_t sz, u32 sig) +static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt, + struct persistent_ram_zone **prz, + phys_addr_t *paddr, size_t sz, u32 sig) { if (!sz) return 0; @@ -367,7 +365,7 @@ static int __devinit ramoops_init_prz(struct device *dev, return 0; } -static int __devinit ramoops_probe(struct platform_device *pdev) +static int ramoops_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct ramoops_platform_data *pdata = pdev->dev.platform_data; diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index eecd2a8a84dd..0306303be372 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -390,8 +390,8 @@ static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size, return 0; } -static int __devinit persistent_ram_post_init(struct persistent_ram_zone *prz, - u32 sig, int ecc_size) +static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, + int ecc_size) { int ret; @@ -443,9 +443,8 @@ void persistent_ram_free(struct persistent_ram_zone *prz) kfree(prz); } -struct persistent_ram_zone * __devinit persistent_ram_new(phys_addr_t start, - size_t size, u32 sig, - int ecc_size) +struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, + u32 sig, int ecc_size) { struct persistent_ram_zone *prz; int ret = -ENOMEM; -- cgit v1.2.1 From 6ae141718e3f9c7e2c620e999c86612a7f415bb1 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 21 Dec 2012 15:16:45 -0800 Subject: misc: remove __dev* attributes. CONFIG_HOTPLUG is going away as an option. 
As a result, the __dev* markings need to be removed. This change removes the last of the __dev* markings from the kernel from a variety of different, tiny, places. Based on patches originally written by Bill Pemberton, but redone by me in order to handle some of the coding style issues better, by hand. Cc: Bill Pemberton Signed-off-by: Greg Kroah-Hartman --- fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index 15cb8618e95d..2b3570b7caeb 100644 --- a/fs/file.c +++ b/fs/file.c @@ -490,7 +490,7 @@ void exit_files(struct task_struct *tsk) } } -static void __devinit fdtable_defer_list_init(int cpu) +static void fdtable_defer_list_init(int cpu) { struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu); spin_lock_init(&fddef->lock); -- cgit v1.2.1 From a07ef784356cf9157bd9bed5254cbb9a82d33722 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sun, 30 Dec 2012 14:52:05 +0900 Subject: f2fs: introduce f2fs_msg to ease adding information prints Introduced the f2fs_msg function to differentiate f2fs-specific messages in the log, and added a few informative prints in the mount path to convey a proper error in case of mount failure. Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/super.c | 80 +++++++++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 65 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 13c6dfbb7183..8199ee9f5875 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -877,6 +877,8 @@ bool f2fs_empty_dir(struct inode *); * super.c */ int f2fs_sync_fs(struct super_block *, int); +extern __printf(3, 4) +void f2fs_msg(struct super_block *, const char *, const char *, ...); /* * hash.c diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 08a94c814bdc..afa7ef0c4ba7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -53,6 +53,18 @@ static match_table_t f2fs_tokens = { {Opt_err, NULL}, }; +void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...)
+{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); + va_end(args); +} + static void init_once(void *foo) { struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo; @@ -247,7 +259,8 @@ static const struct export_operations f2fs_export_ops = { .get_parent = f2fs_get_parent, }; -static int parse_options(struct f2fs_sb_info *sbi, char *options) +static int parse_options(struct super_block *sb, struct f2fs_sb_info *sbi, + char *options) { substring_t args[MAX_OPT_ARGS]; char *p; @@ -286,7 +299,8 @@ static int parse_options(struct f2fs_sb_info *sbi, char *options) break; #else case Opt_nouser_xattr: - pr_info("nouser_xattr options not supported\n"); + f2fs_msg(sb, KERN_INFO, + "nouser_xattr options not supported"); break; #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL @@ -295,7 +309,7 @@ static int parse_options(struct f2fs_sb_info *sbi, char *options) break; #else case Opt_noacl: - pr_info("noacl options not supported\n"); + f2fs_msg(sb, KERN_INFO, "noacl options not supported"); break; #endif case Opt_active_logs: @@ -309,8 +323,9 @@ static int parse_options(struct f2fs_sb_info *sbi, char *options) set_opt(sbi, DISABLE_EXT_IDENTIFY); break; default: - pr_err("Unrecognized mount option \"%s\" or missing value\n", - p); + f2fs_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" or missing value", + p); return -EINVAL; } } @@ -337,23 +352,36 @@ static loff_t max_file_size(unsigned bits) return result; } -static int sanity_check_raw_super(struct f2fs_super_block *raw_super) +static int sanity_check_raw_super(struct super_block *sb, + struct f2fs_super_block *raw_super) { unsigned int blocksize; - if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) + if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) { + f2fs_msg(sb, KERN_INFO, + "Magic Mismatch, valid(0x%x) - read(0x%x)", + F2FS_SUPER_MAGIC, le32_to_cpu(raw_super->magic)); return 1; + } /* Currently, support only 4KB block size */ blocksize = 1 << le32_to_cpu(raw_super->log_blocksize); - if (blocksize != PAGE_CACHE_SIZE) + if (blocksize != PAGE_CACHE_SIZE) { + f2fs_msg(sb, KERN_INFO, + "Invalid blocksize (%u), supports only 4KB\n", + blocksize); return 1; + } if (le32_to_cpu(raw_super->log_sectorsize) != - F2FS_LOG_SECTOR_SIZE) + F2FS_LOG_SECTOR_SIZE) { + f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize"); return 1; + } if (le32_to_cpu(raw_super->log_sectors_per_block) != - F2FS_LOG_SECTORS_PER_BLOCK) + F2FS_LOG_SECTORS_PER_BLOCK) { + f2fs_msg(sb, KERN_INFO, "Invalid log sectors per block"); return 1; + } return 0; } @@ -414,13 +442,16 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; /* set a temporary block size */ - if (!sb_set_blocksize(sb, F2FS_BLKSIZE)) + if (!sb_set_blocksize(sb, F2FS_BLKSIZE)) { + f2fs_msg(sb, KERN_ERR, "unable to set blocksize"); goto free_sbi; + } /* read f2fs raw super block */ raw_super_buf = sb_bread(sb, 0); if (!raw_super_buf) { err = -EIO; + f2fs_msg(sb, KERN_ERR, "unable to read superblock"); goto free_sbi; } raw_super = (struct f2fs_super_block *) @@ -438,12 +469,14 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) set_opt(sbi, POSIX_ACL); #endif /* parse mount options */ - if (parse_options(sbi, (char *)data)) + if (parse_options(sb, sbi, (char *)data)) goto free_sb_buf; /* sanity checking of raw super */ - if (sanity_check_raw_super(raw_super)) + if (sanity_check_raw_super(sb, raw_super)) { + f2fs_msg(sb, KERN_ERR, "Can't find a 
valid F2FS filesystem"); goto free_sb_buf; + } sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize)); sb->s_max_links = F2FS_LINK_MAX; @@ -477,18 +510,23 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) /* get an inode for meta space */ sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); if (IS_ERR(sbi->meta_inode)) { + f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode"); err = PTR_ERR(sbi->meta_inode); goto free_sb_buf; } err = get_valid_checkpoint(sbi); - if (err) + if (err) { + f2fs_msg(sb, KERN_ERR, "Failed to get valid F2FS checkpoint"); goto free_meta_inode; + } /* sanity checking of checkpoint */ err = -EINVAL; - if (sanity_check_ckpt(raw_super, sbi->ckpt)) + if (sanity_check_ckpt(raw_super, sbi->ckpt)) { + f2fs_msg(sb, KERN_ERR, "Invalid F2FS checkpoint"); goto free_cp; + } sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); @@ -510,17 +548,24 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) /* setup f2fs internal modules */ err = build_segment_manager(sbi); - if (err) + if (err) { + f2fs_msg(sb, KERN_ERR, + "Failed to initialize F2FS segment manager"); goto free_sm; + } err = build_node_manager(sbi); - if (err) + if (err) { + f2fs_msg(sb, KERN_ERR, + "Failed to initialize F2FS node manager"); goto free_nm; + } build_gc_manager(sbi); /* get an inode for node space */ sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi)); if (IS_ERR(sbi->node_inode)) { + f2fs_msg(sb, KERN_ERR, "Failed to read node inode"); err = PTR_ERR(sbi->node_inode); goto free_nm; } @@ -533,6 +578,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); if (IS_ERR(root)) { + f2fs_msg(sb, KERN_ERR, "Failed to read root inode"); err = PTR_ERR(root); goto free_node_inode; } -- cgit v1.2.1 From 3af60a49fd2edfe9c5a06bc84d4832450895be96 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sun, 30 Dec 2012 14:52:37 +0900 Subject: f2fs: fix time update in case of f2fs fallocate After punching a hole or expanding an inode with fallocate, the change and modification times are not updated for the file. So, update the times once fallocate has completed without error. Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7f9ea9271ebe..88593c5e743c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -545,6 +545,11 @@ static long f2fs_fallocate(struct file *file, int mode, else ret = expand_inode_data(inode, offset, len, mode); + if (!ret) { + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); + } + f2fs_balance_fs(sbi); return ret; } -- cgit v1.2.1 From 24c366a9ea256b86426b42e75f764495a2558861 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sun, 30 Dec 2012 14:53:08 +0900 Subject: f2fs: remove unneeded INIT_LIST_HEAD at few places While creating a new entry for addition to the list (orphan inode list and fsync inode entry list), there is no need to call HEAD initialization for these entries. So, remove that init part.
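The reason those INIT_LIST_HEAD calls are removable is that a tail insert writes every link field of the new entry itself. A minimal standalone C re-implementation of the idea (this is not the kernel's list.h, just an illustration):

#include <stdio.h>

struct node { struct node *prev, *next; };

/* tail insert on a circular doubly-linked list, in the style of the
 * kernel's list_add_tail(): all four affected links are (re)written,
 * so any earlier initialization of entry->prev/next is dead work */
static void list_add_tail(struct node *entry, struct node *head)
{
        struct node *last = head->prev;

        entry->prev = last;
        entry->next = head;
        last->next = entry;
        head->prev = entry;
}

int main(void)
{
        struct node head = { &head, &head };  /* the head DOES need init */
        struct node a;                        /* deliberately uninitialized */

        list_add_tail(&a, &head);
        printf("head.next == &a: %d\n", head.next == &a);
        return 0;
}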
Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 1 - fs/f2fs/recovery.c | 1 - 2 files changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 6ef36c37e2be..d75c86a17893 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -214,7 +214,6 @@ retry: goto retry; } new->ino = ino; - INIT_LIST_HEAD(&new->list); /* add new_oentry into list which is sorted by inode number */ if (orphan) { diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index b571fee677d5..502c63d8f096 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -151,7 +151,6 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) goto out; } - INIT_LIST_HEAD(&entry->list); list_add_tail(&entry->list, head); entry->blkaddr = blkaddr; } -- cgit v1.2.1 From 7880ceedec55fbc3997d80e68670d03395225367 Mon Sep 17 00:00:00 2001 From: Huajun Li Date: Mon, 31 Dec 2012 13:59:09 +0800 Subject: f2fs: update f2fs partition info about SIT/NAT layout Update partition info output under debug FS to reflect segment layout correctly. Signed-off-by: Huajun Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 0e0380a588ad..b8ed7a72c6e9 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -190,8 +190,8 @@ static int stat_show(struct seq_file *s, void *v) update_general_status(si->sbi); seq_printf(s, "\n=====[ partition info. #%d ]=====\n", i++); - seq_printf(s, "[SB: 1] [CP: 2] [NAT: %d] [SIT: %d] ", - si->nat_area_segs, si->sit_area_segs); + seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", + si->sit_area_segs, si->nat_area_segs); seq_printf(s, "[SSA: %d] [MAIN: %d", si->ssa_area_segs, si->main_area_segs); seq_printf(s, "(OverProv:%d Resv:%d)]\n\n", -- cgit v1.2.1 From d66d1f76878fcb1e78592fe8aecd13f438d6c0d7 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 3 Jan 2013 08:57:21 +0900 Subject: f2fs: initialize newly allocated dnode structure This patch resolves Coverity #753112. In practice, the existing code flow does not fall into the reported erroneous path. But, anyway, let's avoid this for the future. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8199ee9f5875..280713289d8c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -211,11 +211,11 @@ struct dnode_of_data { static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode, struct page *ipage, struct page *npage, nid_t nid) { + memset(dn, 0, sizeof(*dn)); dn->inode = inode; dn->inode_page = ipage; dn->node_page = npage; dn->nid = nid; - dn->inode_page_locked = 0; } /* -- cgit v1.2.1 From c1b75eabec4eddce55ebb078f84481f58272878f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 3 Jan 2013 09:24:28 +0900 Subject: f2fs: avoid null dereference in f2fs_acl_from_disk This patch resolves Coverity #751303: >>> CID 753103: Explicit null dereferenced (FORWARD_NULL) Passing null >>> pointer "value" to function "f2fs_acl_from_disk(char const *, size_t)", which dereferences it.
[Error path] - value = NULL; - retval = 0 by f2fs_getxattr(); - f2fs_acl_from_disk(value:NULL, ...); Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index e95b94945d5f..137af4255da6 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -191,15 +191,14 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) retval = f2fs_getxattr(inode, name_index, "", value, retval); } - if (retval < 0) { - if (retval == -ENODATA) - acl = NULL; - else - acl = ERR_PTR(retval); - } else { + if (retval > 0) acl = f2fs_acl_from_disk(value, retval); - } + else if (retval == -ENODATA) + acl = NULL; + else + acl = ERR_PTR(retval); kfree(value); + if (!IS_ERR(acl)) set_cached_acl(inode, type, acl); -- cgit v1.2.1 From c335a86930b4841c11df12e1fdfd8345e0ebce84 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 3 Jan 2013 09:33:20 +0900 Subject: f2fs: check return value during recovery This patch resolves Coverity #753102: >>> No check of the return value of "f2fs_add_link(&dent, inode)". Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 502c63d8f096..6cc046d36815 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -67,7 +67,7 @@ static int recover_dentry(struct page *ipage, struct inode *inode) kunmap(page); f2fs_put_page(page, 0); } else { - f2fs_add_link(&dent, inode); + err = f2fs_add_link(&dent, inode); } iput(dir); out: -- cgit v1.2.1 From 39e88fcfb1d5c6c4b1ff76ca2ab76cf449b850e8 Mon Sep 17 00:00:00 2001 From: Yanchuan Nian Date: Fri, 4 Jan 2013 20:19:49 +0800 Subject: pnfs: Increase the refcount when LAYOUTGET fails the first time The layout will be set unusable if LAYOUTGET fails. Is it reasonable to increase the refcount iff LAYOUTGET fails the first time? Signed-off-by: Yanchuan Nian Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org [>= 3.7] --- fs/nfs/pnfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index e7165d915362..d00260b08103 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -254,7 +254,7 @@ static void pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit) { lo->plh_retry_timestamp = jiffies; - if (test_and_set_bit(fail_bit, &lo->plh_flags)) + if (!test_and_set_bit(fail_bit, &lo->plh_flags)) atomic_inc(&lo->plh_refcount); } -- cgit v1.2.1 From e25fbe380c4e3c09afa98bcdcd9d3921443adab8 Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Fri, 4 Jan 2013 03:22:57 -0500 Subject: nfs: fix null checking in nfs_get_option_str() The following null pointer check is broken. *option = match_strdup(args); return !option; The pointer `option' must be non-null, and thus `!option' is always false. Use `!*option' instead. The bug was introduced in commit c5cb09b6f8 ("Cleanup: Factor out some cut-and-paste code."). 
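This bug class (testing the pointer-to-pointer instead of the pointee) is easy to reproduce in a standalone C sketch; the helper names here are invented and strdup() stands in for match_strdup():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *dup_or_null(int fail)
{
        return fail ? NULL : strdup("value");  /* models match_strdup() */
}

static int get_option_str(char **option, int fail)
{
        free(*option);
        *option = dup_or_null(fail);
        /* broken: 'return !option;' tests the address of the caller's
         * variable, which is never NULL, so it always reports success */
        return !*option;  /* correct: test what the pointer points at */
}

int main(void)
{
        char *opt = NULL;

        printf("ok path:  %d\n", get_option_str(&opt, 0));  /* 0 = success */
        printf("oom path: %d\n", get_option_str(&opt, 1));  /* 1 = failure */
        free(opt);
        return 0;
}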
Signed-off-by: Xi Wang Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index c25cadf8f8c4..2e7e8c878e5d 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1152,7 +1152,7 @@ static int nfs_get_option_str(substring_t args[], char **option) { kfree(*option); *option = match_strdup(args); - return !option; + return !*option; } static int nfs_get_option_ul(substring_t args[], unsigned long *option) -- cgit v1.2.1 From 6db6dd7d3fd8f7c765dabc376493d6791ab28bd6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 4 Jan 2013 12:47:04 -0500 Subject: NFS: Ensure that we free the rpc_task after read and write cleanups are done This patch ensures that we free the rpc_task after the cleanup callbacks are done in order to avoid a deadlock problem that can be triggered if the callback needs to wait for another workqueue item to complete. Signed-off-by: Trond Myklebust Cc: Weston Andros Adamson Cc: Tejun Heo Cc: Bruce Fields Cc: stable@vger.kernel.org [>= 3.5] --- fs/nfs/read.c | 10 +++++++--- fs/nfs/write.c | 10 +++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/read.c b/fs/nfs/read.c index b6bdb18e892c..a5e5d9899d56 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -91,12 +91,16 @@ void nfs_readdata_release(struct nfs_read_data *rdata) put_nfs_open_context(rdata->args.context); if (rdata->pages.pagevec != rdata->pages.page_array) kfree(rdata->pages.pagevec); - if (rdata != &read_header->rpc_data) - kfree(rdata); - else + if (rdata == &read_header->rpc_data) { rdata->header = NULL; + rdata = NULL; + } if (atomic_dec_and_test(&hdr->refcnt)) hdr->completion_ops->completion(hdr); + /* Note: we only free the rpc_task after callbacks are done. + * See the comment in rpc_free_task() for why + */ + kfree(rdata); } EXPORT_SYMBOL_GPL(nfs_readdata_release); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b673be31590e..c483cc50b82e 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -126,12 +126,16 @@ void nfs_writedata_release(struct nfs_write_data *wdata) put_nfs_open_context(wdata->args.context); if (wdata->pages.pagevec != wdata->pages.page_array) kfree(wdata->pages.pagevec); - if (wdata != &write_header->rpc_data) - kfree(wdata); - else + if (wdata == &write_header->rpc_data) { wdata->header = NULL; + wdata = NULL; + } if (atomic_dec_and_test(&hdr->refcnt)) hdr->completion_ops->completion(hdr); + /* Note: we only free the rpc_task after callbacks are done. + * See the comment in rpc_free_task() for why + */ + kfree(wdata); } EXPORT_SYMBOL_GPL(nfs_writedata_release); -- cgit v1.2.1 From ecf0eb9edbb607d74f74b73c14af8b43f3729528 Mon Sep 17 00:00:00 2001 From: Nickolai Zeldovich Date: Sat, 5 Jan 2013 14:19:51 -0500 Subject: nfs: avoid dereferencing null pointer in initiate_bulk_draining Fix an inverted null pointer check in initiate_bulk_draining(). 
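The ordering the patch establishes (run the completion callback first, free the memory last) can be sketched as a toy standalone C program; the names are invented stand-ins for the nfs read/write release paths, not the real code:

#include <stdio.h>
#include <stdlib.h>

struct read_data { int embedded; };  /* toy stand-in for nfs_read_data */

static void completion(void)
{
        /* may sleep or wait on other workqueue items */
        printf("completion callback runs\n");
}

static void release(struct read_data *rdata, int is_embedded)
{
        if (is_embedded)
                rdata = NULL;  /* embedded in the header: nothing to free */
        completion();
        /* only free AFTER the callback is done, so nothing still running
         * inside the callback can touch (or wait on) this allocation */
        free(rdata);           /* free(NULL) is a harmless no-op */
}

int main(void)
{
        struct read_data *rd = malloc(sizeof(*rd));

        release(rd, 0);
        return 0;
}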
Signed-off-by: Nickolai Zeldovich Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org [>= 3.7] --- fs/nfs/callback_proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index c89b26bc9759..264d1aa935f2 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -206,7 +206,7 @@ static u32 initiate_bulk_draining(struct nfs_client *clp, list_for_each_entry(lo, &server->layouts, plh_layouts) { ino = igrab(lo->plh_inode); - if (ino) + if (!ino) continue; spin_lock(&ino->i_lock); /* Is this layout in the process of being freed? */ -- cgit v1.2.1 From 96465efee14ecca0cdffcb09f9903635db8fc504 Mon Sep 17 00:00:00 2001 From: Valerie Aurora Date: Sun, 6 Jan 2013 23:38:44 -0500 Subject: ext4: fix configuration dependencies for ext4 ACLs and security labels Commit "ext4: Remove CONFIG_EXT4_FS_XATTR" removed the configuration dependencies for ext4 xattrs from the ext4 ACLs and security labels configuration options, but did not replace them with a dependency on ext4 itself. Add back the dependency on ext4 so the options only show up if ext4 is enabled. Signed-off-by: Valerie Aurora Signed-off-by: "Theodore Ts'o" Reviewed-by: Tao Ma --- fs/ext4/Kconfig | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 0a475c881852..987358740cb9 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -41,6 +41,7 @@ config EXT4_USE_FOR_EXT23 config EXT4_FS_POSIX_ACL bool "Ext4 POSIX Access Control Lists" + depends on EXT4_FS select FS_POSIX_ACL help POSIX Access Control Lists (ACLs) support permissions for users and @@ -53,6 +54,7 @@ config EXT4_FS_POSIX_ACL config EXT4_FS_SECURITY bool "Ext4 Security Labels" + depends on EXT4_FS help Security labels support alternative access control models implemented by security modules like SELinux. This option -- cgit v1.2.1 From 0ecaef0644973e9006fdbc6974301047aaff9bc6 Mon Sep 17 00:00:00 2001 From: Guo Chao Date: Sun, 6 Jan 2013 23:38:47 -0500 Subject: ext4: release buffer in failed path in dx_probe() If checksum fails, we should also release the buffer read from previous iteration. Signed-off-by: Guo Chao Signed-off-by: "Theodore Ts'o" Reviewed-by: Darrick J. Wong - Cc: stable@vger.kernel.org -- fs/ext4/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) --- fs/ext4/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 8990165346ee..f8be1c288a1c 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -722,7 +722,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir, ext4_warning(dir->i_sb, "Node failed checksum"); brelse(bh); *err = ERR_BAD_DX_DIR; - goto fail; + goto fail2; } set_buffer_verified(bh); -- cgit v1.2.1 From fef0ebdb229bedce888b63923e2a1ba4e6c6a84c Mon Sep 17 00:00:00 2001 From: Guo Chao Date: Sun, 6 Jan 2013 23:40:25 -0500 Subject: ext4: remove duplicate call to ext4_bread() in ext4_init_new_dir() This fixes a buffer cache leak when creating a directory, introduced in commit a774f9c20. 
Signed-off-by: Guo Chao Signed-off-by: "Theodore Ts'o" Reviewed-by: Tao Ma --- fs/ext4/namei.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f8be1c288a1c..f9ed946a448e 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2368,7 +2368,6 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir, } inode->i_size = EXT4_I(inode)->i_disksize = blocksize; - dir_block = ext4_bread(handle, inode, 0, 1, &err); if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) { if (!err) { err = -EIO; -- cgit v1.2.1 From ae62ca7b03217be5e74759dc6d7698c95df498b3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 6 Jan 2013 18:21:49 +0000 Subject: tcp: fix MSG_SENDPAGE_NOTLAST logic commit 35f9c09fe9c72e (tcp: tcp_sendpages() should call tcp_push() once) added an internal flag : MSG_SENDPAGE_NOTLAST meant to be set on all frags but the last one for a splice() call. The condition used to set the flag in pipe_to_sendpage() relied on splice() user passing the exact number of bytes present in the pipe, or a smaller one. But some programs pass an arbitrary high value, and the test fails. The effect of this bug is a lack of tcp_push() at the end of a splice(pipe -> socket) call, and possibly very slow or erratic TCP sessions. We should both test sd->total_len and fact that another fragment is in the pipe (pipe->nrbufs > 1) Many thanks to Willy for providing very clear bug report, bisection and test programs. Reported-by: Willy Tarreau Bisected-by: Willy Tarreau Tested-by: Willy Tarreau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- fs/splice.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 8890604e3fcd..6909d89d0da5 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -696,8 +696,10 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe, return -EINVAL; more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0; - if (sd->len < sd->total_len) + + if (sd->len < sd->total_len && pipe->nrbufs > 1) more |= MSG_SENDPAGE_NOTLAST; + return file->f_op->sendpage(file, buf->page, buf->offset, sd->len, &pos, more); } -- cgit v1.2.1 From 408e9375610cca6d54e9c654cbe05a647687e12e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 3 Jan 2013 17:55:52 +0900 Subject: f2fs: revisit the f2fs_gc flow I'd like to revisit the f2fs_gc flow and rewrite as follows. 1. In practical, the nGC parameter of f2fs_gc is meaningless. So, let's remove it. 2. Background GC marks victim blocks as dirty one at a time. 3. Foreground GC should do cleaning job until acquiring enough free sections. Afterwards, it needs to do checkpoint. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/gc.c | 60 +++++++++++++++++++------------------------------------ fs/f2fs/segment.c | 2 +- 3 files changed, 23 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 280713289d8c..285e43d602f3 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -986,7 +986,7 @@ int do_write_data_page(struct page *); int start_gc_thread(struct f2fs_sb_info *); void stop_gc_thread(struct f2fs_sb_info *); block_t start_bidx_of_node(unsigned int); -int f2fs_gc(struct f2fs_sb_info *, int); +int f2fs_gc(struct f2fs_sb_info *); void build_gc_manager(struct f2fs_sb_info *); int create_gc_caches(void); void destroy_gc_caches(void); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b0ec721e984a..b4dd90cf1f18 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -78,7 +78,7 @@ static int gc_thread_func(void *data) sbi->bg_gc++; - if (f2fs_gc(sbi, 1) == GC_NONE) + if (f2fs_gc(sbi) == GC_NONE) wait_ms = GC_THREAD_NOGC_SLEEP_TIME; else if (wait_ms == GC_THREAD_NOGC_SLEEP_TIME) wait_ms = GC_THREAD_MAX_SLEEP_TIME; @@ -651,62 +651,44 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, return ret; } -int f2fs_gc(struct f2fs_sb_info *sbi, int nGC) +int f2fs_gc(struct f2fs_sb_info *sbi) { - unsigned int segno; - int old_free_secs, cur_free_secs; - int gc_status, nfree; struct list_head ilist; + unsigned int segno, i; int gc_type = BG_GC; + int gc_status = GC_NONE; INIT_LIST_HEAD(&ilist); gc_more: - nfree = 0; - gc_status = GC_NONE; + if (!(sbi->sb->s_flags & MS_ACTIVE)) + goto stop; if (has_not_enough_free_secs(sbi)) - old_free_secs = reserved_sections(sbi); - else - old_free_secs = free_sections(sbi); - - while (sbi->sb->s_flags & MS_ACTIVE) { - int i; - if (has_not_enough_free_secs(sbi)) - gc_type = FG_GC; + gc_type = FG_GC; - cur_free_secs = free_sections(sbi) + nfree; + if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) + goto stop; - /* We got free space successfully. */ - if (nGC < cur_free_secs - old_free_secs) - break; - - if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) + for (i = 0; i < sbi->segs_per_sec; i++) { + /* + * do_garbage_collect will give us three gc_status: + * GC_ERROR, GC_DONE, and GC_BLOCKED. + * If GC is finished uncleanly, we have to return + * the victim to dirty segment list. + */ + gc_status = do_garbage_collect(sbi, segno + i, &ilist, gc_type); + if (gc_status != GC_DONE) break; - - for (i = 0; i < sbi->segs_per_sec; i++) { - /* - * do_garbage_collect will give us three gc_status: - * GC_ERROR, GC_DONE, and GC_BLOCKED. - * If GC is finished uncleanly, we have to return - * the victim to dirty segment list. 
- */ - gc_status = do_garbage_collect(sbi, segno + i, - &ilist, gc_type); - if (gc_status != GC_DONE) - goto stop; - nfree++; - } } stop: - if (has_not_enough_free_secs(sbi) || gc_status == GC_BLOCKED) { + if (has_not_enough_free_secs(sbi)) { write_checkpoint(sbi, (gc_status == GC_BLOCKED), false); - if (nfree) + if (has_not_enough_free_secs(sbi)) goto gc_more; } +stop: mutex_unlock(&sbi->gc_mutex); put_gc_inode(&ilist); - BUG_ON(!list_empty(&ilist)); return gc_status; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index de6240922b0a..4b0099066582 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -31,7 +31,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi) */ if (has_not_enough_free_secs(sbi)) { mutex_lock(&sbi->gc_mutex); - f2fs_gc(sbi, 1); + f2fs_gc(sbi); } } -- cgit v1.2.1 From 254adaa465c40151df11fc1f88f93e6e86eb61d4 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 9 Jan 2013 17:13:00 -0800 Subject: seq_file: fix new kernel-doc warnings Fix kernel-doc warnings in fs/seq_file.c: Warning(fs/seq_file.c:304): No description found for parameter 'whence' Warning(fs/seq_file.c:304): Excess function parameter 'origin' description in 'seq_lseek' Signed-off-by: Randy Dunlap Cc: Alexander Viro Signed-off-by: Linus Torvalds --- fs/seq_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/seq_file.c b/fs/seq_file.c index 9d863fb501f9..f2bc3dfd0b88 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -296,7 +296,7 @@ EXPORT_SYMBOL(seq_read); * seq_lseek - ->llseek() method for sequential files. * @file: the file in question * @offset: new position - * @origin: 0 for absolute, 1 for relative position + * @whence: 0 for absolute, 1 for relative position * * Ready-made ->f_op->llseek() */ -- cgit v1.2.1 From 7d82db83165dbac8c3f6d47b73c84f38e3996e30 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 11 Jan 2013 13:10:49 +0900 Subject: f2fs: add f2fs_balance_fs in several interfaces The f2fs_balance_fs() is to check the number of free sections and decide whether it needs to conduct cleaning or not. If there are not enough free sections, the cleaning job should be started. In order to control the number of free sections even under high utilization, f2fs should call f2fs_balance_fs at all the VFS interfaces that are able to produce dirty pages. This patch adds the function calls in the missing interfaces as follows. 1. f2fs_setxattr() The f2fs_setxattr() produces dirty node pages, so we should call f2fs_balance_fs(), just as is done in other VFS interfaces such as f2fs_lookup(), f2fs_mkdir(), and so on. 2. f2fs_sync_file() We should guarantee serving free sections for syncing metadata during fsync. Previously, there was no space check before triggering checkpoint and sync_node_pages. Therefore, if a bunch of fsync calls are triggered under 100% of FS utilization, f2fs can be left with no free sections, resulting in a BUG_ON(). 3. f2fs_sync_fs() Before calling write_checkpoint(), we should guarantee that there is a minimum number of free sections. 4. f2fs_write_inode() f2fs_write_inode() is also able to produce dirty node pages.
Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +++ fs/f2fs/inode.c | 3 +++ fs/f2fs/super.c | 2 ++ fs/f2fs/xattr.c | 2 ++ 4 files changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 88593c5e743c..7354c2df1087 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -137,6 +137,9 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret) return ret; + /* guarantee free sections for fsync */ + f2fs_balance_fs(sbi); + mutex_lock(&inode->i_mutex); if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index bf20b4d03214..794241777322 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -217,6 +217,9 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) inode->i_ino == F2FS_META_INO(sbi)) return 0; + if (wbc) + f2fs_balance_fs(sbi); + node_page = get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) return PTR_ERR(node_page); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index afa7ef0c4ba7..0f2b2eb86a05 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -137,6 +137,8 @@ int f2fs_sync_fs(struct super_block *sb, int sync) if (sync) write_checkpoint(sbi, false, false); + else + f2fs_balance_fs(sbi); return 0; } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 940136a3d3a6..8038c0496504 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -318,6 +318,8 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, if (name_len > 255 || value_len > MAX_VALUE_LEN) return -ERANGE; + f2fs_balance_fs(sbi); + mutex_lock_op(sbi, NODE_NEW); if (!fi->i_xattr_nid) { /* Allocate new attribute block */ -- cgit v1.2.1 From 9eaeba701386037cdd2ccd8bf8650feb2e2cec31 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 11 Jan 2013 14:09:38 +0900 Subject: f2fs: move f2fs_balance_fs to punch_hole The f2fs_fallocate() has two operations: punch_hole and expand_size. Only in the case of punch_hole, dirty node pages can be produced, so let's trigger f2fs_balance_fs() in this case only. Furthermore, let's trigger it at every data truncation routine. 
Signed-off-by: Namjae Jeon Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7354c2df1087..819de7f39f26 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -410,6 +410,8 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) struct dnode_of_data dn; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + f2fs_balance_fs(sbi); + mutex_lock_op(sbi, DATA_TRUNC); set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, index, RDONLY_NODE); @@ -537,7 +539,6 @@ static long f2fs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file->f_path.dentry->d_inode; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); long ret; if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) @@ -552,8 +553,6 @@ static long f2fs_fallocate(struct file *file, int mode, inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); } - - f2fs_balance_fs(sbi); return ret; } -- cgit v1.2.1 From f1688e0431d3a395388e70fe21da89ed0de0c323 Mon Sep 17 00:00:00 2001 From: Dave Reisner Date: Wed, 2 Jan 2013 08:54:37 -0500 Subject: debugfs: convert gid= argument from decimal, not octal This patch technically breaks userspace, but I suspect that anyone who actually used this flag would have encountered this brokenness, declared it lunacy, and already sent a patch. Signed-off-by: Dave Reisner Reviewed-by: Vasiliy Kulikov Acked-by: Kees Cook Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 153bb1e42e63..a5f12b7e228d 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -176,7 +176,7 @@ static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts) opts->uid = uid; break; case Opt_gid: - if (match_octal(&args[0], &option)) + if (match_int(&args[0], &option)) return -EINVAL; gid = make_kgid(current_user_ns(), option); if (!gid_valid(gid)) -- cgit v1.2.1 From 6d92d4f6a74766cc885b18218268e0c47fbca399 Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Fri, 11 Jan 2013 14:31:48 -0800 Subject: fs/exec.c: work around icc miscompilation The tricky problem is this check: if (i++ >= max) icc (mis)optimizes this check as: if (++i > max) The check now becomes a no-op since max is MAX_ARG_STRINGS (0x7FFFFFFF). This is "allowed" by the C standard, assuming i++ never overflows, because signed integer overflow is undefined behavior. This optimization effectively reverts the previous commit 362e6663ef23 ("exec.c, compat.c: fix count(), compat_count() bounds checking") that tries to fix the check. This patch simply moves ++ after the check. 
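The miscompilation is easy to reproduce in a standalone sketch (illustrative only; max plays the role of MAX_ARG_STRINGS and -1 stands in for -E2BIG):

	#include <limits.h>

	/* Pre-fix form: because signed overflow is undefined behavior, a
	 * compiler may assume i++ never wraps and rewrite the test as
	 * "++i > max", which can never be true when max == INT_MAX.
	 */
	static int bounded_incr_buggy(int i, int max)
	{
		if (i++ >= max)
			return -1;
		return i;
	}

	/* Post-fix form: increment only after the bound check has passed,
	 * so there is no potential overflow for the optimizer to reason
	 * about and the bound is always enforced.
	 */
	static int bounded_incr_fixed(int i, int max)
	{
		if (i >= max)
			return -1;
		++i;
		return i;
	}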
Signed-off-by: Xi Wang Cc: Jason Baron Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 18c45cac368f..20df02c1cc70 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -434,8 +434,9 @@ static int count(struct user_arg_ptr argv, int max) if (IS_ERR(p)) return -EFAULT; - if (i++ >= max) + if (i >= max) return -E2BIG; + ++i; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; -- cgit v1.2.1 From ff9234ad4e974768455071c91bd76402e4af8a28 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 12 Jan 2013 14:41:13 +0900 Subject: f2fs: remove redundant call to set_blocksize in f2fs_fill_super Since, f2fs supports only 4KB blocksize, which is set at the beginning in f2fs_fill_super. So, we do not need to again check this blocksize setting in such case. Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 0f2b2eb86a05..ac127fde8e11 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -443,7 +443,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (!sbi) return -ENOMEM; - /* set a temporary block size */ + /* set a block size */ if (!sb_set_blocksize(sb, F2FS_BLKSIZE)) { f2fs_msg(sb, KERN_ERR, "unable to set blocksize"); goto free_sbi; @@ -542,10 +542,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&sbi->dir_inode_list); spin_lock_init(&sbi->dir_inode_lock); - /* init super block */ - if (!sb_set_blocksize(sb, sbi->blocksize)) - goto free_cp; - init_orphan_info(sbi); /* setup f2fs internal modules */ -- cgit v1.2.1 From 163799872b65b0cbf0091d82971233cc3d2425d3 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 12 Jan 2013 14:41:33 +0900 Subject: f2fs: avoid redundant time update for parent directory in f2fs_delete_entry In call to f2fs_delete_entry, 'dir' time modification code is put at two places. So, remove the redundant code for timing update. Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 951ed52748f6..989980e16d0b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -503,7 +503,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, } if (inode) { - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + inode->i_ctime = CURRENT_TIME; drop_nlink(inode); if (S_ISDIR(inode->i_mode)) { drop_nlink(inode); -- cgit v1.2.1 From cfa7a9ccda711ac6ab8f0d17c3a9b540092d305a Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Mon, 17 Dec 2012 06:38:51 +0000 Subject: Btrfs: fix memory leak in name_cache_insert() We should free name_cache_entry before returning from the error handling code. 
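The rule being applied is the usual pairwise-cleanup pattern for nested allocations (a generic sketch with hypothetical variables, not the btrfs code): once the first allocation has succeeded, every later failure path owns it and must free it:

	a = kmalloc(sizeof(*a), GFP_NOFS);
	if (!a)
		return -ENOMEM;

	b = kmalloc(sizeof(*b), GFP_NOFS);
	if (!b) {
		kfree(a);	/* without this, 'a' leaks on the error path */
		return -ENOMEM;
	}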
Signed-off-by: Tsutomu Itoh --- fs/btrfs/send.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 54454542ad40..321b7fb4e441 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1814,8 +1814,10 @@ static int name_cache_insert(struct send_ctx *sctx, (unsigned long)nce->ino); if (!nce_head) { nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS); - if (!nce_head) + if (!nce_head) { + kfree(nce); return -ENOMEM; + } INIT_LIST_HEAD(nce_head); ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); -- cgit v1.2.1 From cc975eb4605c5765a5d5e7a51d24ba5a1cda269e Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 7 Dec 2012 10:09:19 +0000 Subject: btrfs: get the device in write mode when deleting it When we're deleting the device we should get it in write mode since we're going to re-write the super block magic on that device. And it should fail if the device is read-only. Signed-off-by: Lukas Czerner --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5cce6aa74012..86279c37de64 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1431,7 +1431,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) } } else { ret = btrfs_get_bdev_and_sb(device_path, - FMODE_READ | FMODE_EXCL, + FMODE_WRITE | FMODE_EXCL, root->fs_info->bdev_holder, 0, &bdev, &bh); if (ret) -- cgit v1.2.1 From d86e56cf7d3669dd292012ac82b986bd1573b6cc Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 15 Nov 2012 11:35:41 +0000 Subject: Btrfs: disable qgroup id 0 Qgroup id 0 is a special number, we should not set the id of a qgroup to 0. Fix it. Signed-off-by: Miao Xie --- fs/btrfs/ioctl.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7624212ae926..dd8e3448fe8f 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3698,6 +3698,11 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) goto drop_write; } + if (!sa->qgroupid) { + ret = -EINVAL; + goto out; + } + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); -- cgit v1.2.1 From 5c39da5b6ca23e68e7acea7f4c01470383475214 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 22 Oct 2012 11:39:53 +0000 Subject: Btrfs: do not delete a subvolume which is in a R/O subvolume Steps to reproduce: # mkfs.btrfs # mount # btrfs sub create /subv0 # btrfs sub snap /subv0/snap0 # change /subv0 from R/W to R/O # btrfs sub del /subv0/snap0 We deleted the snapshot successfully. I think we should not be able to delete the snapshot since the parent subvolume is R/O.
Signed-off-by: Miao Xie --- fs/btrfs/ioctl.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index dd8e3448fe8f..5a72896bd769 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2095,13 +2095,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, err = inode_permission(inode, MAY_WRITE | MAY_EXEC); if (err) goto out_dput; - - /* check if subvolume may be deleted by a non-root user */ - err = btrfs_may_delete(dir, dentry, 1); - if (err) - goto out_dput; } + /* check if subvolume may be deleted by a user */ + err = btrfs_may_delete(dir, dentry, 1); + if (err) + goto out_dput; + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) { err = -EINVAL; goto out_dput; -- cgit v1.2.1 From dba60f3f5d564167118cad151a7d41dfe8d2a5f7 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 21 Dec 2012 09:19:51 +0000 Subject: Btrfs: fix resize a readonly device We should not resize a readonly device, fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/ioctl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5a72896bd769..0de21213d05d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1362,6 +1362,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, printk(KERN_INFO "btrfs: resizing devid %llu\n", (unsigned long long)devid); } + device = btrfs_find_device(root->fs_info, devid, NULL, NULL); if (!device) { printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", @@ -1369,9 +1370,10 @@ static noinline int btrfs_ioctl_resize(struct file *file, ret = -EINVAL; goto out_free; } - if (device->fs_devices && device->fs_devices->seeding) { + + if (!device->writeable) { printk(KERN_INFO "btrfs: resizer unable to apply on " - "seeding device %llu\n", + "readonly device %llu\n", (unsigned long long)devid); ret = -EINVAL; goto out_free; -- cgit v1.2.1 From 97547676570b3bd908560741315bf4b7d635bcf5 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 21 Dec 2012 10:38:50 +0000 Subject: Btrfs: fix missing write access release in btrfs_ioctl_resize() We forget to give up the write access after we find some device operation is going on. Fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/ioctl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0de21213d05d..982c0b9ceea5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1339,6 +1339,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + mnt_drop_write_file(file); return -EINPROGRESS; } -- cgit v1.2.1 From 72bcd99d450cb1dde8bf13c3b65fc5883b2a3893 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Dec 2012 15:16:34 -0500 Subject: Btrfs: set flushing if we're limited flushing We still need to say we're flushing if we're limit flushing to keep somebody from coming in and stealing our reservation. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d133edfcd449..61fefda74ff5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3997,7 +3997,7 @@ again: * We make the other tasks wait for the flush only when we can flush * all things. 
*/ - if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) { + if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { flushing = true; space_info->flush = 1; } -- cgit v1.2.1 From f3fe820c20a1a36c790545184e734e78d61cd68d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 7 Jan 2013 17:03:21 -0500 Subject: Btrfs: add orphan before truncating pagecache Running xfstests 83 in a loop would sometimes fail the fsck. This happens because if we invalidate a page that already has an ordered extent setup for it we will complete the ordered extent ourselves, assuming that the truncate will clean everything up. The problem with this is there is plenty of time for the truncate to fail after we've done this work. So to fix this we need to add the orphan item first to make sure the cleanup gets done properly, and then we can truncate the pagecache and all that stuff and be safe. This fixes the btrfsck failures I was seeing while running 83 in a loop. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 53 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 67ed24ae86bb..4ddcf79e7894 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2478,6 +2478,18 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) continue; } nr_truncate++; + + /* 1 for the orphan item deletion. */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + ret = btrfs_orphan_add(trans, inode); + btrfs_end_transaction(trans, root); + if (ret) + goto out; + ret = btrfs_truncate(inode); } else { nr_unlink++; @@ -3783,9 +3795,34 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize) set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, &BTRFS_I(inode)->runtime_flags); + /* + * 1 for the orphan item we're going to add + * 1 for the orphan item deletion. + */ + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + /* + * We need to do this in case we fail at _any_ point during the + * actual truncate. Once we do the truncate_setsize we could + * invalidate pages which forces any outstanding ordered io to + * be instantly completed which will give us extents that need + * to be truncated. If we fail to get an orphan inode down we + * could have left over extents that were never meant to live, + * so we need to garuntee from this point on that everything + * will be consistent. + */ + ret = btrfs_orphan_add(trans, inode); + btrfs_end_transaction(trans, root); + if (ret) + return ret; + /* we don't support swapfiles, so vmtruncate shouldn't fail */ truncate_setsize(inode, newsize); ret = btrfs_truncate(inode); + if (ret && inode->i_nlink) + btrfs_orphan_del(NULL, inode); } return ret; @@ -6929,11 +6966,9 @@ static int btrfs_truncate(struct inode *inode) /* * 1 for the truncate slack space - * 1 for the orphan item we're going to add - * 1 for the orphan item deletion * 1 for updating the inode. */ - trans = btrfs_start_transaction(root, 4); + trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out; @@ -6944,12 +6979,6 @@ static int btrfs_truncate(struct inode *inode) min_size); BUG_ON(ret); - ret = btrfs_orphan_add(trans, inode); - if (ret) { - btrfs_end_transaction(trans, root); - goto out; - } - /* * setattr is responsible for setting the ordered_data_close flag, * but that is only tested during the last file release. 
That @@ -7018,12 +7047,6 @@ static int btrfs_truncate(struct inode *inode) ret = btrfs_orphan_del(trans, inode); if (ret) err = ret; - } else if (ret && inode->i_nlink > 0) { - /* - * Failed to do the truncate, remove us from the in memory - * orphan list. - */ - ret = btrfs_orphan_del(NULL, inode); } if (trans) { -- cgit v1.2.1 From ac5c93005b7073732e268606688fb6c821d5310e Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 27 Dec 2012 09:01:24 +0000 Subject: Btrfs: let allocation start from the right raid type This avoids empty looping. Say we have only one disk: the metadata raid type then defaults to DUP, and we do not need to start from index=0 (RAID10) and get through two empty loops to index=2 (DUP). Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 61fefda74ff5..aeba53191ece 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5560,7 +5560,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, int empty_cluster = 2 * 1024 * 1024; struct btrfs_space_info *space_info; int loop = 0; - int index = 0; + int index = __get_raid_index(data); int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; bool found_uncached_bg = false; -- cgit v1.2.1 From 3268a2468eb6a31af89930cbae58a62fe6ca6d2d Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 28 Dec 2012 09:33:19 +0000 Subject: Btrfs: reset path lock state to zero We forgot to reset the path lock state to zero after we unlock the path block, and this can trip the ASSERT checker in the tree unlock API. Reported-by: Slava Barinov Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index aeba53191ece..85b8454d9608 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6788,11 +6788,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, &wc->flags[level]); if (ret < 0) { btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; return ret; } BUG_ON(wc->refs[level] == 0); if (wc->refs[level] == 1) { btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; return 1; } } -- cgit v1.2.1 From 1214b53f90131fee1f950010c43e92455fe598ab Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 7 Jan 2013 03:53:08 +0000 Subject: Btrfs: fix off-by-one in lseek Lock end is inclusive. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 20452c110d7d..fa48051484b8 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2242,6 +2242,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) if (lockend <= lockstart) lockend = lockstart + root->sectorsize; + lockend--; len = lockend - lockstart + 1; len = max_t(u64, len, root->sectorsize); -- cgit v1.2.1 From f9e4fb53938de5db01950c9dfe479703b2f5c964 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 7 Jan 2013 10:10:12 +0000 Subject: Btrfs: fix a bug when llseek for delalloc bytes behind prealloc extents xfstests case 285 complains. It is because btrfs did not try to find unwritten delalloc bytes (only dirty pages, not yet written back) behind prealloc extents, so it ends up finding nothing while seeking with SEEK_DATA.
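What the xfstests case exercises can be sketched from userspace (hypothetical path and sizes, error handling omitted): preallocate an extent, dirty one page inside it without writeback, and expect SEEK_DATA to find that page:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/mnt/testfile", O_RDWR | O_CREAT, 0644); /* hypothetical path */

		fallocate(fd, 0, 0, 1 << 20); /* preallocated (unwritten) extents */
		pwrite(fd, "x", 1, 4096);     /* delalloc: dirty page, no writeback yet */

		/* Before the fix, btrfs looked for delalloc bytes only behind
		 * holes, so this lseek could miss the dirty page sitting over
		 * a prealloc extent.
		 */
		printf("SEEK_DATA: %lld\n", (long long)lseek(fd, 0, SEEK_DATA));
		return 0;
	}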
Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 9 ++++++--- fs/btrfs/inode.c | 11 ++++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index fa48051484b8..841cfe3be0e0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2309,9 +2309,12 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) } } - *offset = start; - free_extent_map(em); - break; + if (!test_bit(EXTENT_FLAG_PREALLOC, + &em->flags)) { + *offset = start; + free_extent_map(em); + break; + } } } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4ddcf79e7894..ac98384b174e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5623,10 +5623,13 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag return em; if (em) { /* - * if our em maps to a hole, there might - * actually be delalloc bytes behind it + * if our em maps to + * - a hole or + * - a pre-alloc extent, + * there might actually be delalloc bytes behind it. */ - if (em->block_start != EXTENT_MAP_HOLE) + if (em->block_start != EXTENT_MAP_HOLE && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) return em; else hole_em = em; @@ -5708,6 +5711,8 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag */ em->block_start = hole_em->block_start; em->block_len = hole_len; + if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags)) + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); } else { em->start = range_start; em->len = found; -- cgit v1.2.1 From f276795627045a3c599a60b476767861e4318c7d Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 8 Jan 2013 19:37:58 +0000 Subject: btrfs: fix btrfs_cont_expand() freeing IS_ERR em btrfs_cont_expand() tries to free an IS_ERR em as it gets an error from btrfs_get_extent() and breaks out of its loop. An instance of -EEXIST was reported in the wild: https://bugzilla.redhat.com/show_bug.cgi?id=874407 I have no idea if that -EEXIST is surprising, or not. Regardless, this error handling should be cleaned up to handle other reasonable errors (ENOMEM, EIO; whatever). This seemed to be the only buggy freeing of the relatively rare IS_ERR em so I opted to fix the caller rather than teach free_extent_map() to use IS_ERR_OR_NULL(). Signed-off-by: Zach Brown Reviewed-by: Eric Sandeen Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ac98384b174e..3d2c64d4734a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3677,6 +3677,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) block_end - cur_offset, 0); if (IS_ERR(em)) { err = PTR_ERR(em); + em = NULL; break; } last_byte = min(extent_map_end(em), block_end); -- cgit v1.2.1 From 3972f2603d8570effaf633cea52b12c7c2773c11 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sat, 12 Jan 2013 02:57:22 +0000 Subject: btrfs: update timestamps on truncate() truncate() vs. ftruncate() differ in the VFS; truncate() doesn't set (ATTR_CTIME | ATTR_MTIME), and it's up to the fs to do the timestamp updates if the size changes. 
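The asymmetry is visible from userspace (a sketch with a hypothetical path, error handling omitted): truncate() reaches the filesystem without ATTR_CTIME | ATTR_MTIME in the iattr mask, while ftruncate() on an open fd has them set, so the filesystem itself must bump the times on the path below:

	#include <stdio.h>
	#include <sys/stat.h>
	#include <unistd.h>

	int main(void)
	{
		struct stat st;

		truncate("/mnt/testfile", 0); /* hypothetical path; size-only attr change */
		stat("/mnt/testfile", &st);

		/* With the patch, btrfs_setsize() notices that ATTR_CTIME and
		 * ATTR_MTIME are absent and updates the times itself whenever
		 * the size actually changes.
		 */
		printf("mtime: %ld\n", (long)st.st_mtime);
		return 0;
	}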
Signed-off-by: Eric Sandeen Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3d2c64d4734a..9bc6c40b182d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -88,7 +88,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, }; -static int btrfs_setsize(struct inode *inode, loff_t newsize); +static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct inode *inode); static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); static noinline int cow_file_range(struct inode *inode, @@ -3761,16 +3761,27 @@ next: return err; } -static int btrfs_setsize(struct inode *inode, loff_t newsize) +static int btrfs_setsize(struct inode *inode, struct iattr *attr) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; loff_t oldsize = i_size_read(inode); + loff_t newsize = attr->ia_size; + int mask = attr->ia_valid; int ret; if (newsize == oldsize) return 0; + /* + * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a + * special case where we need to update the times despite not having + * these flags set. For all other operations the VFS set these flags + * explicitly if it wants a timestamp update. + */ + if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) + inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); + if (newsize > oldsize) { truncate_pagecache(inode, oldsize, newsize); ret = btrfs_cont_expand(inode, oldsize, newsize); @@ -3843,7 +3854,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) return err; if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { - err = btrfs_setsize(inode, attr->ia_size); + err = btrfs_setsize(inode, attr); if (err) return err; } -- cgit v1.2.1 From 6d283dba3721cc43be014b50a1acc2f35860a65a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 14 Jan 2013 13:17:50 -0800 Subject: vfs: add missing virtual cache flush after editing partial pages Andrew Morton pointed this out a month ago, and then I completely forgot about it. If we read a partial last page of a block device, we will zero out the end of the page, but since that page can then be mapped into user space, we should also make sure to flush the cache on architectures that have virtual caches. We have the flush_dcache_page() function for this, so use it. Now, in practice this really never matters, because nobody sane uses virtual caches to begin with, and they largely exist on old broken RISC architectures. And even if you did run on one of those obsolete CPUs, the whole "mmap and access the last partial page of a block device" behavior probably doesn't actually exist. The normal IO functions (read/write) will never see the zeroed-out part of the page that might not be coherent in the cache, because they honor the size of the device. So I'm marking this for stable (3.7 only), but I'm not sure anybody will ever care.
Pointed-out-by: Andrew Morton Cc: stable@vger.kernel.org # 3.7 Signed-off-by: Linus Torvalds --- fs/buffer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index c017a2dfb909..7a75c3e0fd58 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2935,6 +2935,7 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) void *kaddr = kmap_atomic(bh->b_page); memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes); kunmap_atomic(kaddr); + flush_dcache_page(bh->b_page); } } -- cgit v1.2.1 From 7e2fb2d7e6a3094473f101ae33dd6431ae6d2ed1 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 18 Dec 2012 11:03:57 -0600 Subject: jbd: don't wake kjournald unnecessarily Don't send an extra wakeup to kjournald in the case where we already have the proper target in j_commit_request, i.e. that commit has already been requested for commit. commit d9b0193 "jbd: fix fsync() tid wraparound bug" changed the logic leading to a wakeup, but it caused some extra wakeups which were found to lead to a measurable performance regression. Signed-off-by: Eric Sandeen Signed-off-by: Jan Kara --- fs/jbd/journal.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index a2862339323b..81cc7eaff863 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -446,7 +446,8 @@ int __log_start_commit(journal_t *journal, tid_t target) * currently running transaction (if it exists). Otherwise, * the target tid must be an old one. */ - if (journal->j_running_transaction && + if (journal->j_commit_request != target && + journal->j_running_transaction && journal->j_running_transaction->t_tid == target) { /* * We want a new commit: OK, mark the request and wakeup the -- cgit v1.2.1 From 1b1baff6e50df855238ce5e6c0e7dbb8a261fb32 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Mon, 14 Jan 2013 22:53:47 +0100 Subject: UDF: Fix a null pointer dereference in udf_sb_free_partitions This patch fixes a regression caused by commit bff943af6fe "udf: Fix memory leak when mounting" due to which it was triggering a kernel null pointer dereference in case of interrupted mount OR when allocating memory to sbi->s_partmaps failed in function udf_sb_alloc_partition_maps. Reported-and-tested-by: James Hogan Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Signed-off-by: Jan Kara --- fs/udf/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/udf/super.c b/fs/udf/super.c index d44fb568abe1..e9be396a558d 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -307,7 +307,8 @@ static void udf_sb_free_partitions(struct super_block *sb) { struct udf_sb_info *sbi = UDF_SB(sb); int i; - + if (sbi->s_partmaps == NULL) + return; for (i = 0; i < sbi->s_partitions; i++) udf_free_partition(&sbi->s_partmaps[i]); kfree(sbi->s_partmaps); -- cgit v1.2.1 From fa9150a84ca333f68127097c4fa1eda4b3913a22 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Tue, 15 Jan 2013 16:45:24 +0900 Subject: f2fs: remove the blk_plug usage in f2fs_write_data_pages Let's consider the usage of blk_plug in f2fs_write_data_pages(). We can come up with two issues: lock contention and task awareness. 1. Merging bios prior to grabbing "queue lock" f2fs merges consecutive IOs at the file system level before submitting any bios, which is similar to the back merge by the plugging mechanism in attempt_plug_merge(). Both of them need to acquire no queue lock. 2.
Merging policy with respect to tasks The f2fs merges IOs as much as possible regardless of tasks, while blk-plugging is conducted on a per-task basis. As we can understand there are trade-offs, f2fs tries to maximize the write performance with well-merged bios. As a result, if f2fs produces many consecutive but separated bios in writepages(), it would be good to use blk-plugging since f2fs would be able to avoid queue lock contention in the block layer by merging them. But f2fs merges IOs and submits one bio, which means that there are not many chances to merge bios by attempt_plug_merge(). However, f2fs has already been using blk_plug by triggering generic_writepages() in f2fs_write_data_pages(). So, for overall code consistency, I'd like to remove blk_plug there. Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3aa5ce7cab83..b1347fc6d688 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -547,6 +547,15 @@ redirty_out: #define MAX_DESIRED_PAGES_WP 4096 +static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct address_space *mapping = data; + int ret = mapping->a_ops->writepage(page, wbc); + mapping_set_error(mapping, ret); + return ret; +} + static int f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -563,7 +572,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (!S_ISDIR(inode->i_mode)) mutex_lock(&sbi->writepages); - ret = generic_writepages(mapping, wbc); + ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); if (!S_ISDIR(inode->i_mode)) mutex_unlock(&sbi->writepages); f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL)); -- cgit v1.2.1 From 66af62ce7588736ae65edfdb1c0df597775c4d21 Mon Sep 17 00:00:00 2001 From: majianpeng Date: Mon, 14 Jan 2013 20:08:16 +0800 Subject: f2fs: add global mutex_lock to protect f2fs_stat_list There is a race condition between umounting f2fs and reading f2fs/status, which results in an oops. For example: Thread A Thread B umount f2fs cat f2fs/status f2fs_destroy_stats() { stat_show() { list_for_each_entry_safe(&f2fs_stat_list) list_del(&si->stat_list); mutex_lock(&si->stat_lock); si->sbi = NULL; mutex_unlock(&si->stat_lock); kfree(sbi->stat_info); } mutex_lock(&si->stat_lock) <- si is gone. ... } Solution with a global lock: f2fs_stat_mutex: Thread A Thread B umount f2fs cat f2fs/status f2fs_destroy_stats() { stat_show() { mutex_lock(&f2fs_stat_mutex); list_del(&si->stat_list); mutex_unlock(&f2fs_stat_mutex); kfree(sbi->stat_info); mutex_lock(&f2fs_stat_mutex); } list_for_each_entry_safe(&f2fs_stat_list) ...
mutex_unlock(&f2fs_stat_mutex); } Signed-off-by: Jianpeng Ma [jaegeuk.kim@samsung.com: fix typos, description, and remove the existing lock] Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index b8ed7a72c6e9..73f034a94182 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -26,6 +26,7 @@ static LIST_HEAD(f2fs_stat_list); static struct dentry *debugfs_root; +static DEFINE_MUTEX(f2fs_stat_mutex); static void update_general_status(struct f2fs_sb_info *sbi) { @@ -180,13 +181,9 @@ static int stat_show(struct seq_file *s, void *v) int i = 0; int j; + mutex_lock(&f2fs_stat_mutex); list_for_each_entry_safe(si, next, &f2fs_stat_list, stat_list) { - mutex_lock(&si->stat_lock); - if (!si->sbi) { - mutex_unlock(&si->stat_lock); - continue; - } update_general_status(si->sbi); seq_printf(s, "\n=====[ partition info. #%d ]=====\n", i++); @@ -286,8 +283,8 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, "\nMemory: %u KB = static: %u + cached: %u\n", (si->base_mem + si->cache_mem) >> 10, si->base_mem >> 10, si->cache_mem >> 10); - mutex_unlock(&si->stat_lock); } + mutex_unlock(&f2fs_stat_mutex); return 0; } @@ -313,9 +310,6 @@ static int init_stats(struct f2fs_sb_info *sbi) return -ENOMEM; si = sbi->stat_info; - mutex_init(&si->stat_lock); - list_add_tail(&si->stat_list, &f2fs_stat_list); - si->all_area_segs = le32_to_cpu(raw_super->segment_count); si->sit_area_segs = le32_to_cpu(raw_super->segment_count_sit); si->nat_area_segs = le32_to_cpu(raw_super->segment_count_nat); @@ -325,6 +319,11 @@ static int init_stats(struct f2fs_sb_info *sbi) si->main_area_zones = si->main_area_sections / le32_to_cpu(raw_super->secs_per_zone); si->sbi = sbi; + + mutex_lock(&f2fs_stat_mutex); + list_add_tail(&si->stat_list, &f2fs_stat_list); + mutex_unlock(&f2fs_stat_mutex); + return 0; } @@ -347,10 +346,10 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = sbi->stat_info; + mutex_lock(&f2fs_stat_mutex); list_del(&si->stat_list); - mutex_lock(&si->stat_lock); - si->sbi = NULL; - mutex_unlock(&si->stat_lock); + mutex_unlock(&f2fs_stat_mutex); + kfree(sbi->stat_info); } -- cgit v1.2.1 From 4589d25d015c2d02bb5f7075d0cbf6dcf23a33c0 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Tue, 15 Jan 2013 19:58:47 +0900 Subject: f2fs: fix the debugfs entry creation path As the "status" debugfs entry will be maintained for entire F2FS filesystem irrespective of the number of partitions. So, we can move the initialization to the init part of the f2fs and destroy will be done from exit part. After making changes, for individual partition mount - entry creation code will not be executed. 
Signed-off-by: Jianpeng Ma Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 27 ++++++++++----------------- fs/f2fs/f2fs.h | 6 ++++-- fs/f2fs/super.c | 7 +++++-- 3 files changed, 19 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 73f034a94182..c8c37307b326 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -300,7 +300,7 @@ static const struct file_operations stat_fops = { .release = single_release, }; -static int init_stats(struct f2fs_sb_info *sbi) +int f2fs_build_stats(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_stat_info *si; @@ -327,21 +327,6 @@ static int init_stats(struct f2fs_sb_info *sbi) return 0; } -int f2fs_build_stats(struct f2fs_sb_info *sbi) -{ - int retval; - - retval = init_stats(sbi); - if (retval) - return retval; - - if (!debugfs_root) - debugfs_root = debugfs_create_dir("f2fs", NULL); - - debugfs_create_file("status", S_IRUGO, debugfs_root, NULL, &stat_fops); - return 0; -} - void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = sbi->stat_info; @@ -353,7 +338,15 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) kfree(sbi->stat_info); } -void destroy_root_stats(void) +void __init f2fs_create_root_stats(void) +{ + debugfs_root = debugfs_create_dir("f2fs", NULL); + if (debugfs_root) + debugfs_create_file("status", S_IRUGO, debugfs_root, + NULL, &stat_fops); +} + +void f2fs_destroy_root_stats(void) { debugfs_remove_recursive(debugfs_root); debugfs_root = NULL; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 285e43d602f3..976325d51e3d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1060,7 +1060,8 @@ struct f2fs_stat_info { int f2fs_build_stats(struct f2fs_sb_info *); void f2fs_destroy_stats(struct f2fs_sb_info *); -void destroy_root_stats(void); +void f2fs_create_root_stats(void); +void f2fs_destroy_root_stats(void); #else #define stat_inc_call_count(si) #define stat_inc_seg_count(si, type) @@ -1070,7 +1071,8 @@ void destroy_root_stats(void); static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } -static inline void destroy_root_stats(void) { } +static inline void f2fs_create_root_stats(void) { } +static inline void f2fs_destroy_root_stats(void) { } #endif extern const struct file_operations f2fs_dir_operations; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ac127fde8e11..d551a724b736 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -675,14 +675,17 @@ static int __init init_f2fs_fs(void) err = create_checkpoint_caches(); if (err) goto fail; - return register_filesystem(&f2fs_fs_type); + err = register_filesystem(&f2fs_fs_type); + if (err) + goto fail; + f2fs_create_root_stats(); fail: return err; } static void __exit exit_f2fs_fs(void) { - destroy_root_stats(); + f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); destroy_checkpoint_caches(); destroy_gc_caches(); -- cgit v1.2.1 From d44d9bc68e32ad5881b105f82bd259d261d1ef74 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Tue, 4 Dec 2012 17:18:02 -0600 Subject: xfs: use b_maps[] for discontiguous buffers Commits starting at 77c1a08 introduced a multiple segment support to xfs_buf. xfs_trans_buf_item_match() could not find a multi-segment buffer in the transaction because it was looking at the single segment block number rather than the multi-segment b_maps[0].bm.bn. 
This results in a recursive buffer lock that can never be satisfied. This patch: 1) Changes the remaining b_map accesses to b_maps[0] accesses. 2) Renames the single segment b_map structure to __b_map to avoid future confusion. Signed-off-by: Mark Tinguely Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 12 ++++++------ fs/xfs/xfs_buf.h | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 26673a0b20e7..56d1614760cf 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -175,7 +175,7 @@ xfs_buf_get_maps( bp->b_map_count = map_count; if (map_count == 1) { - bp->b_maps = &bp->b_map; + bp->b_maps = &bp->__b_map; return 0; } @@ -193,7 +193,7 @@ static void xfs_buf_free_maps( struct xfs_buf *bp) { - if (bp->b_maps != &bp->b_map) { + if (bp->b_maps != &bp->__b_map) { kmem_free(bp->b_maps); bp->b_maps = NULL; } @@ -377,8 +377,8 @@ xfs_buf_allocate_memory( } use_alloc_page: - start = BBTOB(bp->b_map.bm_bn) >> PAGE_SHIFT; - end = (BBTOB(bp->b_map.bm_bn + bp->b_length) + PAGE_SIZE - 1) + start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT; + end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1) >> PAGE_SHIFT; page_count = end - start; error = _xfs_buf_get_pages(bp, page_count, flags); @@ -640,7 +640,7 @@ _xfs_buf_read( xfs_buf_flags_t flags) { ASSERT(!(flags & XBF_WRITE)); - ASSERT(bp->b_map.bm_bn != XFS_BUF_DADDR_NULL); + ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL); bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD); bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); @@ -1709,7 +1709,7 @@ xfs_buf_cmp( struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list); xfs_daddr_t diff; - diff = ap->b_map.bm_bn - bp->b_map.bm_bn; + diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn; if (diff < 0) return -1; if (diff > 0) diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 23f5642480bb..433a12ed7b17 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -151,7 +151,7 @@ typedef struct xfs_buf { struct page **b_pages; /* array of page pointers */ struct page *b_page_array[XB_PAGES]; /* inline pages */ struct xfs_buf_map *b_maps; /* compound buffer map */ - struct xfs_buf_map b_map; /* inline compound buffer map */ + struct xfs_buf_map __b_map; /* inline compound buffer map */ int b_map_count; int b_io_length; /* IO size in BBs */ atomic_t b_pin_count; /* pin count */ @@ -330,8 +330,8 @@ void xfs_buf_stale(struct xfs_buf *bp); * In future, uncached buffers will pass the block number directly to the io * request function and hence these macros will go away at that point. */ -#define XFS_BUF_ADDR(bp) ((bp)->b_map.bm_bn) -#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_map.bm_bn = (xfs_daddr_t)(bno)) +#define XFS_BUF_ADDR(bp) ((bp)->b_maps[0].bm_bn) +#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_maps[0].bm_bn = (xfs_daddr_t)(bno)) static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) { -- cgit v1.2.1 From 0f22f9d0cd8a630b40a9ccc07a8844345b185aae Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Tue, 4 Dec 2012 17:18:03 -0600 Subject: xfs: rename bli_format to avoid confusion with bli_formats Rename the bli_format structure to __bli_format to avoid accidentally confusing it with the bli_formats pointer.
Signed-off-by: Mark Tinguely Reviewed-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_buf_item.c | 22 +++++++++++----------- fs/xfs/xfs_buf_item.h | 2 +- fs/xfs/xfs_trans_buf.c | 24 ++++++++++++------------ 3 files changed, 24 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index becf4a97efc6..1975b3d9007a 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -71,7 +71,7 @@ xfs_buf_item_log_debug( chunk_num = byte >> XFS_BLF_SHIFT; word_num = chunk_num >> BIT_TO_WORD_SHIFT; bit_num = chunk_num & (NBWORD - 1); - wordp = &(bip->bli_format.blf_data_map[word_num]); + wordp = &(bip->__bli_format.blf_data_map[word_num]); bit_set = *wordp & (1 << bit_num); ASSERT(bit_set); byte++; @@ -237,7 +237,7 @@ xfs_buf_item_size( * cancel flag in it. */ trace_xfs_buf_item_size_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); return bip->bli_format_count; } @@ -278,7 +278,7 @@ xfs_buf_item_format_segment( uint buffer_offset; /* copy the flags across from the base format item */ - blfp->blf_flags = bip->bli_format.blf_flags; + blfp->blf_flags = bip->__bli_format.blf_flags; /* * Base size is the actual size of the ondisk structure - it reflects @@ -371,7 +371,7 @@ xfs_buf_item_format_segment( nbits++; } } - bip->bli_format.blf_size = nvecs; + bip->__bli_format.blf_size = nvecs; return vecp; } @@ -405,7 +405,7 @@ xfs_buf_item_format( if (bip->bli_flags & XFS_BLI_INODE_BUF) { if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && xfs_log_item_in_current_chkpt(lip))) - bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF; + bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF; bip->bli_flags &= ~XFS_BLI_INODE_BUF; } @@ -485,7 +485,7 @@ xfs_buf_item_unpin( ASSERT(bip->bli_flags & XFS_BLI_STALE); ASSERT(xfs_buf_islocked(bp)); ASSERT(XFS_BUF_ISSTALE(bp)); - ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); trace_xfs_buf_item_unpin_stale(bip); @@ -631,7 +631,7 @@ xfs_buf_item_unlock( */ if (bip->bli_flags & XFS_BLI_STALE) { trace_xfs_buf_item_unlock_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); if (!aborted) { atomic_dec(&bip->bli_refcount); return; @@ -644,8 +644,8 @@ xfs_buf_item_unlock( * If the buf item isn't tracking any data, free it, otherwise drop the * reference we hold to it. 
*/ - if (xfs_bitmap_empty(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size)) + if (xfs_bitmap_empty(bip->__bli_format.blf_data_map, + bip->__bli_format.blf_map_size)) xfs_buf_item_relse(bp); else atomic_dec(&bip->bli_refcount); @@ -716,7 +716,7 @@ xfs_buf_item_get_format( bip->bli_format_count = count; if (count == 1) { - bip->bli_formats = &bip->bli_format; + bip->bli_formats = &bip->__bli_format; return 0; } @@ -731,7 +731,7 @@ STATIC void xfs_buf_item_free_format( struct xfs_buf_log_item *bip) { - if (bip->bli_formats != &bip->bli_format) { + if (bip->bli_formats != &bip->__bli_format) { kmem_free(bip->bli_formats); bip->bli_formats = NULL; } diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 6850f49f4af3..16def435944a 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -104,7 +104,7 @@ typedef struct xfs_buf_log_item { #endif int bli_format_count; /* count of headers */ struct xfs_buf_log_format *bli_formats; /* array of in-log header ptrs */ - struct xfs_buf_log_format bli_format; /* embedded in-log header */ + struct xfs_buf_log_format __bli_format; /* embedded in-log header */ } xfs_buf_log_item_t; void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 4fc17d479d42..f7510bf68284 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -93,7 +93,7 @@ _xfs_trans_bjoin( xfs_buf_item_init(bp, tp->t_mountp); bip = bp->b_fspriv; ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); if (reset_recur) bip->bli_recur = 0; @@ -432,7 +432,7 @@ xfs_trans_brelse(xfs_trans_t *tp, bip = bp->b_fspriv; ASSERT(bip->bli_item.li_type == XFS_LI_BUF); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); trace_xfs_trans_brelse(bip); @@ -519,7 +519,7 @@ xfs_trans_bhold(xfs_trans_t *tp, ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_HOLD; @@ -539,7 +539,7 @@ xfs_trans_bhold_release(xfs_trans_t *tp, ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT(bip->bli_flags & XFS_BLI_HOLD); @@ -598,7 +598,7 @@ xfs_trans_log_buf(xfs_trans_t *tp, bip->bli_flags &= ~XFS_BLI_STALE; ASSERT(XFS_BUF_ISSTALE(bp)); XFS_BUF_UNSTALE(bp); - bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL; + bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL; } tp->t_flags |= XFS_TRANS_DIRTY; @@ -657,8 +657,8 @@ xfs_trans_binval( */ ASSERT(XFS_BUF_ISSTALE(bp)); ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF)); - ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF)); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY); ASSERT(tp->t_flags & XFS_TRANS_DIRTY); return; @@ -668,10 +668,10 @@ xfs_trans_binval( bip->bli_flags 
|= XFS_BLI_STALE; bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); - bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; - bip->bli_format.blf_flags |= XFS_BLF_CANCEL; - memset((char *)(bip->bli_format.blf_data_map), 0, - (bip->bli_format.blf_map_size * sizeof(uint))); + bip->__bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; + bip->__bli_format.blf_flags |= XFS_BLF_CANCEL; + memset((char *)(bip->__bli_format.blf_data_map), 0, + (bip->__bli_format.blf_map_size * sizeof(uint))); bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; tp->t_flags |= XFS_TRANS_DIRTY; } @@ -775,5 +775,5 @@ xfs_trans_dquot_buf( type == XFS_BLF_GDQUOT_BUF); ASSERT(atomic_read(&bip->bli_refcount) > 0); - bip->bli_format.blf_flags |= type; + bip->__bli_format.blf_flags |= type; } -- cgit v1.2.1 From 2d0e9df579029b62adc72b50977182757cc04cd5 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Tue, 4 Dec 2012 17:18:04 -0600 Subject: xfs: fix segment in xfs_buf_item_format_segment Not every segment in a multi-segment buffer is dirty in a transaction and they will not be outputted. The assert in xfs_buf_item_format_segment() that checks for the at least one chunk of data in the segment to be used is not necessary true for multi-segmented buffers. Signed-off-by: Mark Tinguely Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_buf_item.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 1975b3d9007a..c48e60bd857d 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -287,6 +287,17 @@ xfs_buf_item_format_segment( */ base_size = offsetof(struct xfs_buf_log_format, blf_data_map) + (blfp->blf_map_size * sizeof(blfp->blf_data_map[0])); + + nvecs = 0; + first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); + if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) { + /* + * If the map is not be dirty in the transaction, mark + * the size as zero and do not advance the vector pointer. + */ + goto out; + } + vecp->i_addr = blfp; vecp->i_len = base_size; vecp->i_type = XLOG_REG_TYPE_BFORMAT; @@ -301,15 +312,13 @@ xfs_buf_item_format_segment( */ trace_xfs_buf_item_format_stale(bip); ASSERT(blfp->blf_flags & XFS_BLF_CANCEL); - blfp->blf_size = nvecs; - return vecp; + goto out; } /* * Fill in an iovec for each set of contiguous chunks. */ - first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); - ASSERT(first_bit != -1); + last_bit = first_bit; nbits = 1; for (;;) { @@ -371,7 +380,8 @@ xfs_buf_item_format_segment( nbits++; } } - bip->__bli_format.blf_size = nvecs; +out: + blfp->blf_size = nvecs; return vecp; } -- cgit v1.2.1 From 91e4bac0b72736410c88632906953f14259144b1 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Tue, 4 Dec 2012 17:18:05 -0600 Subject: xfs: fix the multi-segment log buffer format Per Dave Chinner suggestion, this patch: 1) Corrects the detection of whether a multi-segment buffer is still tracking data. 2) Clears all the buffer log formats for a multi-segment buffer. 
Signed-off-by: Mark Tinguely Reviewed-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_buf_item.c | 13 ++++++++++--- fs/xfs/xfs_trans_buf.c | 7 +++++-- 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index c48e60bd857d..77b09750e92c 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -611,7 +611,7 @@ xfs_buf_item_unlock( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; - int aborted; + int aborted, clean, i; uint hold; /* Clear the buffer's association with this transaction. */ @@ -654,8 +654,15 @@ xfs_buf_item_unlock( * If the buf item isn't tracking any data, free it, otherwise drop the * reference we hold to it. */ - if (xfs_bitmap_empty(bip->__bli_format.blf_data_map, - bip->__bli_format.blf_map_size)) + clean = 1; + for (i = 0; i < bip->bli_format_count; i++) { + if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, + bip->bli_formats[i].blf_map_size)) { + clean = 0; + break; + } + } + if (clean) xfs_buf_item_relse(bp); else atomic_dec(&bip->bli_refcount); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index f7510bf68284..3edf5dbee001 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -643,6 +643,7 @@ xfs_trans_binval( xfs_buf_t *bp) { xfs_buf_log_item_t *bip = bp->b_fspriv; + int i; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); @@ -670,8 +671,10 @@ xfs_trans_binval( bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); bip->__bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; bip->__bli_format.blf_flags |= XFS_BLF_CANCEL; - memset((char *)(bip->__bli_format.blf_data_map), 0, - (bip->__bli_format.blf_map_size * sizeof(uint))); + for (i = 0; i < bip->bli_format_count; i++) { + memset(bip->bli_formats[i].blf_data_map, 0, + (bip->bli_formats[i].blf_map_size * sizeof(uint))); + } bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; tp->t_flags |= XFS_TRANS_DIRTY; } -- cgit v1.2.1 From ab7eac22008f044631c0a3f4be344ebc2cb0e266 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 21 Dec 2012 10:45:17 -0500 Subject: xfs: remove int casts from debug dquot soft limit timer asserts The int casts here make it easy to trigger an assert with a large soft limit. For example, set a >4TB soft limit on an empty volume to reproduce a (0 > -x) comparison due to an overflow of d_blk_softlimit. 
Signed-off-by: Brian Foster Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_qm_syscalls.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 5f53e75409b8..8a59f8546552 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -784,11 +784,11 @@ xfs_qm_scall_getquota( (XFS_IS_OQUOTA_ENFORCED(mp) && (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) && dst->d_id != 0) { - if (((int) dst->d_bcount > (int) dst->d_blk_softlimit) && + if ((dst->d_bcount > dst->d_blk_softlimit) && (dst->d_blk_softlimit > 0)) { ASSERT(dst->d_btimer != 0); } - if (((int) dst->d_icount > (int) dst->d_ino_softlimit) && + if ((dst->d_icount > dst->d_ino_softlimit) && (dst->d_ino_softlimit > 0)) { ASSERT(dst->d_itimer != 0); } -- cgit v1.2.1 From 37f13561de6039b3a916d1510086030d097dea0f Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 10 Jan 2013 10:41:48 -0600 Subject: xfs: recalculate leaf entry pointer after compacting a dir2 block Dave Jones hit this assert when doing a compile on recent git, with CONFIG_XFS_DEBUG enabled: XFS: Assertion failed: (char *)dup - (char *)hdr == be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)), file: fs/xfs/xfs_dir2_data.c, line: 828 Upon further digging, the tag found by xfs_dir2_data_unused_tag_p(dup) contained "2" and not the proper offset, and I found that this value was changed after the memmoves under "Use a stale leaf for our new entry." in xfs_dir2_block_addname(), i.e. memmove(&blp[mid + 1], &blp[mid], (highstale - mid) * sizeof(*blp)); overwrote it. What has happened is that the previous call to xfs_dir2_block_compact() has rearranged things; it changes btp->count as well as the blp array. So after we make that call, we must recalculate the proper pointer to the leaf entries by making another call to xfs_dir2_block_leaf_p(). Dave provided a metadump image which led to a simple reproducer (create a particular filename in the affected directory) and this resolves the testcase as well as the bug on his live system. Thanks also to dchinner for looking at this one with me. Signed-off-by: Eric Sandeen Tested-by: Dave Jones Reviewed-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_dir2_block.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 7536faaa61e7..12afe07a91d7 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -355,10 +355,12 @@ xfs_dir2_block_addname( /* * If need to compact the leaf entries, do it now. */ - if (compact) + if (compact) { xfs_dir2_block_compact(tp, bp, hdr, btp, blp, &needlog, &lfloghigh, &lfloglow); - else if (btp->stale) { + /* recalculate blp post-compaction */ + blp = xfs_dir2_block_leaf_p(btp); + } else if (btp->stale) { /* * Set leaf logging boundaries to impossible state. * For the no-stale case they're set explicitly. -- cgit v1.2.1 From 8ce03fd76d323526a693d05d85296ef07a387a9f Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Sat, 17 Nov 2012 12:45:47 +0100 Subject: cuse: use mutex as registration lock instead of spinlocks We need to check for name-collisions during cuse-device registration. To avoid race-conditions, this needs to be protected during the whole device registration. Therefore, replace the spinlocks by mutexes first so we can safely extend the locked regions to include more expensive or sleeping code paths. 
Signed-off-by: David Herrmann Acked-by: Tejun Heo Signed-off-by: Miklos Szeredi --- fs/fuse/cuse.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index ee8d55042298..048e89f25082 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -45,7 +45,6 @@ #include #include #include -#include #include #include @@ -63,7 +62,7 @@ struct cuse_conn { bool unrestricted_ioctl; }; -static DEFINE_SPINLOCK(cuse_lock); /* protects cuse_conntbl */ +static DEFINE_MUTEX(cuse_lock); /* protects registration */ static struct list_head cuse_conntbl[CUSE_CONNTBL_LEN]; static struct class *cuse_class; @@ -114,14 +113,14 @@ static int cuse_open(struct inode *inode, struct file *file) int rc; /* look up and get the connection */ - spin_lock(&cuse_lock); + mutex_lock(&cuse_lock); list_for_each_entry(pos, cuse_conntbl_head(devt), list) if (pos->dev->devt == devt) { fuse_conn_get(&pos->fc); cc = pos; break; } - spin_unlock(&cuse_lock); + mutex_unlock(&cuse_lock); /* dead? */ if (!cc) @@ -377,9 +376,9 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) cc->cdev = cdev; /* make the device available */ - spin_lock(&cuse_lock); + mutex_lock(&cuse_lock); list_add(&cc->list, cuse_conntbl_head(devt)); - spin_unlock(&cuse_lock); + mutex_unlock(&cuse_lock); /* announce device availability */ dev_set_uevent_suppress(dev, 0); @@ -520,9 +519,9 @@ static int cuse_channel_release(struct inode *inode, struct file *file) int rc; /* remove from the conntbl, no more access from this point on */ - spin_lock(&cuse_lock); + mutex_lock(&cuse_lock); list_del_init(&cc->list); - spin_unlock(&cuse_lock); + mutex_unlock(&cuse_lock); /* remove device */ if (cc->dev) -- cgit v1.2.1 From 30783587b0f318b9e2e165f34cf5dfd9425a4904 Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Sat, 17 Nov 2012 12:45:48 +0100 Subject: cuse: do not register multiple devices with identical names Sysfs doesn't allow two devices with the same name, but we register a sysfs entry for each cuse device without checking for name collisions. This extends the registration to first check whether the name was already registered. To avoid race-conditions between the name-check and linking the device, we need to protect the whole registration with a mutex. 
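This is the classic check-then-act race: the name lookup and the device linking have to sit inside one critical section, or two concurrent registrations can both pass the check. In sketch form, with name_in_use() as a hypothetical helper and the error choice illustrative:

	mutex_lock(&cuse_lock);
	if (name_in_use(devinfo.name)) {	/* check: hypothetical helper */
		rc = -EBUSY;
	} else {
		rc = device_add(dev);		/* act, still under the lock */
		if (!rc)
			list_add(&cc->list, cuse_conntbl_head(devt));
	}
	mutex_unlock(&cuse_lock);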
Signed-off-by: David Herrmann Acked-by: Tejun Heo Signed-off-by: Miklos Szeredi --- fs/fuse/cuse.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 048e89f25082..2a2797e2abc5 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -304,14 +304,14 @@ static void cuse_gendev_release(struct device *dev) */ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) { - struct cuse_conn *cc = fc_to_cc(fc); + struct cuse_conn *cc = fc_to_cc(fc), *pos; struct cuse_init_out *arg = req->out.args[0].value; struct page *page = req->pages[0]; struct cuse_devinfo devinfo = { }; struct device *dev; struct cdev *cdev; dev_t devt; - int rc; + int rc, i; if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) { @@ -355,15 +355,24 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) dev_set_drvdata(dev, cc); dev_set_name(dev, "%s", devinfo.name); + mutex_lock(&cuse_lock); + + /* make sure the device-name is unique */ + for (i = 0; i < CUSE_CONNTBL_LEN; ++i) { + list_for_each_entry(pos, &cuse_conntbl[i], list) + if (!strcmp(dev_name(pos->dev), dev_name(dev))) + goto err_unlock; + } + rc = device_add(dev); if (rc) - goto err_device; + goto err_unlock; /* register cdev */ rc = -ENOMEM; cdev = cdev_alloc(); if (!cdev) - goto err_device; + goto err_unlock; cdev->owner = THIS_MODULE; cdev->ops = &cuse_frontend_fops; @@ -376,7 +385,6 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) cc->cdev = cdev; /* make the device available */ - mutex_lock(&cuse_lock); list_add(&cc->list, cuse_conntbl_head(devt)); mutex_unlock(&cuse_lock); @@ -390,7 +398,8 @@ out: err_cdev: cdev_del(cdev); -err_device: +err_unlock: + mutex_unlock(&cuse_lock); put_device(dev); err_region: unregister_chrdev_region(devt, 1); -- cgit v1.2.1 From e2560362cc2b39a0567cab510121a7e93dfbe797 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 15 Jan 2013 12:24:46 +0100 Subject: cuse: fix uninitialized variable warnings Fix the following compiler warnings: fs/fuse/cuse.c: In function 'cuse_process_init_reply': fs/fuse/cuse.c:288:24: warning: 'val' may be used uninitialized in this function [-Wmaybe-uninitialized] fs/fuse/cuse.c:272:14: note: 'val' was declared here fs/fuse/cuse.c:284:10: warning: 'key' may be used uninitialized in this function [-Wmaybe-uninitialized] fs/fuse/cuse.c:272:8: note: 'key' was declared here Signed-off-by: Miklos Szeredi --- fs/fuse/cuse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 2a2797e2abc5..e397b675b029 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -266,7 +266,7 @@ static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp) static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo) { char *end = p + len; - char *key, *val; + char *uninitialized_var(key), *uninitialized_var(val); int rc; while (true) { -- cgit v1.2.1 From 807185eb3e7271d7a1f2c08f3aa1b63dba547d07 Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Wed, 29 Aug 2012 17:51:51 -0400 Subject: fuse: Move CUSE Kconfig entry from fs/Kconfig into fs/fuse/Kconfig Given that CUSE depends on FUSE, it only makes sense to move its Kconfig entry into the FUSE Kconfig file. Also, add a few grammatical and semantic touchups. Signed-off-by: Robert P. J. 
Day Signed-off-by: Miklos Szeredi --- fs/Kconfig | 10 ---------- fs/fuse/Kconfig | 16 ++++++++++++++-- 2 files changed, 14 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index cfe512fd1caf..780725a463b1 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -68,16 +68,6 @@ source "fs/quota/Kconfig" source "fs/autofs4/Kconfig" source "fs/fuse/Kconfig" -config CUSE - tristate "Character device in Userspace support" - depends on FUSE_FS - help - This FUSE extension allows character devices to be - implemented in userspace. - - If you want to develop or use userspace character device - based on CUSE, answer Y or M. - config GENERIC_ACL bool select FS_POSIX_ACL diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 0cf160a94eda..1b2f6c2c3aaf 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -4,12 +4,24 @@ config FUSE_FS With FUSE it is possible to implement a fully functional filesystem in a userspace program. - There's also companion library: libfuse. This library along with - utilities is available from the FUSE homepage: + There's also a companion library: libfuse2. This library is available + from the FUSE homepage: + although chances are your distribution already has that library + installed if you've installed the "fuse" package itself. See for more information. See for needed library/utility version. If you want to develop a userspace FS, or if you want to use a filesystem based on FUSE, answer Y or M. + +config CUSE + tristate "Character device in Userspace support" + depends on FUSE_FS + help + This FUSE extension allows character devices to be + implemented in userspace. + + If you want to develop or use a userspace character device + based on CUSE, answer Y or M. -- cgit v1.2.1 From cdadb11cef1802c1b0228976f08647d276711086 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 10 Nov 2012 16:55:56 +0100 Subject: fuse: make fuse_file_fallocate() static Fix the following sparse warning: fs/fuse/file.c:2249:6: warning: symbol 'fuse_file_fallocate' was not declared. Should it be static? Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e21d4d8f87e3..f3ab824fa302 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2177,8 +2177,8 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, return ret; } -long fuse_file_fallocate(struct file *file, int mode, loff_t offset, - loff_t length) +static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, + loff_t length) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; @@ -2213,7 +2213,6 @@ long fuse_file_fallocate(struct file *file, int mode, loff_t offset, return err; } -EXPORT_SYMBOL_GPL(fuse_file_fallocate); static const struct file_operations fuse_file_operations = { .llseek = fuse_file_llseek, -- cgit v1.2.1 From 8f706111a860c026bcb0abe0c5936f59c31e5c87 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 18 Oct 2012 22:51:25 +0800 Subject: fuse: remove unused variable in fuse_try_move_page() The variables mapping,index are initialized but never used otherwise, so remove the unused variables. dpatch engine is used to auto generate this patch. 
(https://github.com/weiyj/dpatch) Signed-off-by: Wei Yongjun Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c16335315e5d..e83351aa5bad 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -692,8 +692,6 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) struct page *oldpage = *pagep; struct page *newpage; struct pipe_buffer *buf = cs->pipebufs; - struct address_space *mapping; - pgoff_t index; unlock_request(cs->fc, cs->req); fuse_copy_finish(cs); @@ -724,9 +722,6 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (fuse_check_page(newpage) != 0) goto out_fallback_unlock; - mapping = oldpage->mapping; - index = oldpage->index; - /* * This is a new and locked page, it shouldn't be mapped or * have any special flags on it -- cgit v1.2.1 From ed0fb78fb6aa294a719f8f5654fdff0ec8bc00bc Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 20 Jan 2013 15:57:57 +0200 Subject: Btrfs: bring back balance pause/resume logic Balance pause/resume logic got broken by 5ac00add (went in into 3.8-rc1 as part of dev-replace merge). Offending commit took a stab at making mutually exclusive volume operations (add_dev, rm_dev, resize, balance, replace_dev) not block behind volume_mutex if another such operation is in progress and instead return an error right away. Balancing front-end relied on the blocking behaviour, so the fix is ugly, but short of a complete rework, it's the best we can do. Reported-by: Liu Bo Signed-off-by: Ilya Dryomov --- fs/btrfs/ioctl.c | 78 ++++++++++++++++++++++++++++++++++++++++++++---------- fs/btrfs/volumes.c | 10 ++++--- 2 files changed, 71 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 982c0b9ceea5..77d8273e394c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3440,8 +3440,8 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ioctl_balance_args *bargs; struct btrfs_balance_control *bctl; + bool need_unlock; /* for mut. excl. ops lock */ int ret; - int need_to_clear_lock = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -3450,14 +3450,61 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) if (ret) return ret; - mutex_lock(&fs_info->volume_mutex); +again: + if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); + need_unlock = true; + goto locked; + } + + /* + * mut. excl. ops lock is locked. 
Three possibilites: + * (1) some other op is running + * (2) balance is running + * (3) balance is paused -- special case (think resume) + */ mutex_lock(&fs_info->balance_mutex); + if (fs_info->balance_ctl) { + /* this is either (2) or (3) */ + if (!atomic_read(&fs_info->balance_running)) { + mutex_unlock(&fs_info->balance_mutex); + if (!mutex_trylock(&fs_info->volume_mutex)) + goto again; + mutex_lock(&fs_info->balance_mutex); + + if (fs_info->balance_ctl && + !atomic_read(&fs_info->balance_running)) { + /* this is (3) */ + need_unlock = false; + goto locked; + } + + mutex_unlock(&fs_info->balance_mutex); + mutex_unlock(&fs_info->volume_mutex); + goto again; + } else { + /* this is (2) */ + mutex_unlock(&fs_info->balance_mutex); + ret = -EINPROGRESS; + goto out; + } + } else { + /* this is (1) */ + mutex_unlock(&fs_info->balance_mutex); + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + ret = -EINVAL; + goto out; + } + +locked: + BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running)); if (arg) { bargs = memdup_user(arg, sizeof(*bargs)); if (IS_ERR(bargs)) { ret = PTR_ERR(bargs); - goto out; + goto out_unlock; } if (bargs->flags & BTRFS_BALANCE_RESUME) { @@ -3477,13 +3524,10 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) bargs = NULL; } - if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, - 1)) { - pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + if (fs_info->balance_ctl) { ret = -EINPROGRESS; goto out_bargs; } - need_to_clear_lock = 1; bctl = kzalloc(sizeof(*bctl), GFP_NOFS); if (!bctl) { @@ -3504,11 +3548,17 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) } do_balance: - ret = btrfs_balance(bctl, bargs); /* - * bctl is freed in __cancel_balance or in free_fs_info if - * restriper was paused all the way until unmount + * Ownership of bctl and mutually_exclusive_operation_running + * goes to to btrfs_balance. bctl is freed in __cancel_balance, + * or, if restriper was paused all the way until unmount, in + * free_fs_info. mutually_exclusive_operation_running is + * cleared in __cancel_balance. 
*/ + need_unlock = false; + + ret = btrfs_balance(bctl, bargs); + if (arg) { if (copy_to_user(arg, bargs, sizeof(*bargs))) ret = -EFAULT; @@ -3516,12 +3566,12 @@ do_balance: out_bargs: kfree(bargs); -out: - if (need_to_clear_lock) - atomic_set(&root->fs_info->mutually_exclusive_operation_running, - 0); +out_unlock: mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); + if (need_unlock) + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); +out: mnt_drop_write_file(file); return ret; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 86279c37de64..9c84dbe64f18 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2959,6 +2959,8 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) unset_balance_control(fs_info); ret = del_balance_item(fs_info->tree_root); BUG_ON(ret); + + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); } void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, @@ -3138,8 +3140,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl, out: if (bctl->flags & BTRFS_BALANCE_RESUME) __cancel_balance(fs_info); - else + else { kfree(bctl); + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + } return ret; } @@ -3156,7 +3160,6 @@ static int balance_kthread(void *data) ret = btrfs_balance(fs_info->balance_ctl, NULL); } - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); @@ -3179,7 +3182,6 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) return 0; } - WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); if (IS_ERR(tsk)) return PTR_ERR(tsk); @@ -3233,6 +3235,8 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) btrfs_balance_sys(leaf, item, &disk_bargs); btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); + WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); + mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); -- cgit v1.2.1 From 2c0c9da02a2c4289350da6e54202a86602c0f926 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 20 Jan 2013 15:57:57 +0200 Subject: Btrfs: fix "mutually exclusive op is running" error code The error code that is returned in response to starting a mutually exclusive operation when there is one already running got silently changed from EINVAL to EINPROGRESS by 5ac00add. Returning EINPROGRESS to, say, add_dev, when rm_dev is running is misleading. Furthermore, the operation itself may want to use EINPROGRESS for other purposes. 
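The gate these ioctls share, in isolation: atomic_xchg() returns the previous value, so a non-zero result means another exclusive operation already owns the slot and the caller backs off. A sketch of the pattern with the corrected error code:

	if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
		return -EINVAL;		/* not -EINPROGRESS */
	}
	/* ... perform the add/rm/resize/balance work ... */
	atomic_set(&fs_info->mutually_exclusive_operation_running, 0);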
Signed-off-by: Ilya Dryomov --- fs/btrfs/ioctl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 77d8273e394c..259dd52d8785 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1340,7 +1340,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); mnt_drop_write_file(file); - return -EINPROGRESS; + return -EINVAL; } mutex_lock(&root->fs_info->volume_mutex); @@ -2192,7 +2192,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); - return -EINPROGRESS; + return -EINVAL; } ret = mnt_want_write_file(file); if (ret) { @@ -2266,7 +2266,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); - return -EINPROGRESS; + return -EINVAL; } mutex_lock(&root->fs_info->volume_mutex); @@ -2303,7 +2303,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); mnt_drop_write_file(file); - return -EINPROGRESS; + return -EINVAL; } mutex_lock(&root->fs_info->volume_mutex); -- cgit v1.2.1 From 18f39c416d18d74ac11d157e44247253d3fa30ae Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 20 Jan 2013 15:57:57 +0200 Subject: Btrfs: fix unlock order in btrfs_ioctl_resize Fix unlock order in btrfs_ioctl_resize(). Signed-off-by: Ilya Dryomov --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 259dd52d8785..fcf1b1b40082 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1446,8 +1446,8 @@ out_free: kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); - mnt_drop_write_file(file); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + mnt_drop_write_file(file); return ret; } -- cgit v1.2.1 From 4ac20c70b0734b65662ded735e5f6ba0415bdb71 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 20 Jan 2013 15:57:57 +0200 Subject: Btrfs: fix unlock order in btrfs_ioctl_rm_dev Fix unlock order in btrfs_ioctl_rm_dev(). Signed-off-by: Ilya Dryomov --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fcf1b1b40082..f5c1c150d9f3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2319,8 +2319,8 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); - mnt_drop_write_file(file); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + mnt_drop_write_file(file); return ret; } -- cgit v1.2.1 From 25122d15e21cf252e91e4cad7cea760f97df29f1 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 20 Jan 2013 15:57:57 +0200 Subject: Btrfs: reorder locks and sanity checks in btrfs_ioctl_defrag Operation-specific check (whether subvol is readonly or not) should go after the mutual exclusiveness check. 
Signed-off-by: Ilya Dryomov --- fs/btrfs/ioctl.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f5c1c150d9f3..afbf3ac2079d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2186,19 +2186,20 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) struct btrfs_ioctl_defrag_range_args *range; int ret; - if (btrfs_root_readonly(root)) - return -EROFS; + ret = mnt_want_write_file(file); + if (ret) + return ret; if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + mnt_drop_write_file(file); return -EINVAL; } - ret = mnt_want_write_file(file); - if (ret) { - atomic_set(&root->fs_info->mutually_exclusive_operation_running, - 0); - return ret; + + if (btrfs_root_readonly(root)) { + ret = -EROFS; + goto out; } switch (inode->i_mode & S_IFMT) { @@ -2250,8 +2251,8 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) ret = -EINVAL; } out: - mnt_drop_write_file(file); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + mnt_drop_write_file(file); return ret; } -- cgit v1.2.1 From e3e2775cedc9d6294b7bc7cbe9f59c62f9472871 Mon Sep 17 00:00:00 2001 From: Nickolai Zeldovich Date: Wed, 16 Jan 2013 21:36:17 -0500 Subject: cifs: fix srcip_matches() for ipv6 srcip_matches() previously had code like this: srcip_matches(..., struct sockaddr *rhs) { /* ... */ struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *) &rhs; return ipv6_addr_equal(..., &vaddr6->sin6_addr); } which interpreted the values on the stack after the 'rhs' pointer as an ipv6 address. The correct thing to do is to use 'rhs', not '&rhs'. Signed-off-by: Nickolai Zeldovich Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 17c3643e5950..12b3da39733b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1917,7 +1917,7 @@ srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs) } case AF_INET6: { struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr; - struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)&rhs; + struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)rhs; return ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr); } default: -- cgit v1.2.1 From ff24858c65d9c518af41aad22fb964685351051a Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Thu, 17 Jan 2013 01:22:08 -0700 Subject: Btrfs: ignore orphan qgroup relations If a qgroup that still has assignments is deleted by the user, the corresponding relations are left in the tree. This leads to an unmountable filesystem. With this patch, those relations are simply ignored.
Reported-by: Eric Hopper Signed-off-by: Arne Jansen Signed-off-by: Chris Mason --- fs/btrfs/qgroup.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index fe9d02c45f8e..28f2b39f6a25 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -379,6 +379,13 @@ next1: ret = add_relation_rb(fs_info, found_key.objectid, found_key.offset); + if (ret == -ENOENT) { + printk(KERN_WARNING + "btrfs: orphan qgroup relation 0x%llx->0x%llx\n", + (unsigned long long)found_key.objectid, + (unsigned long long)found_key.offset); + ret = 0; /* ignore the error */ + } if (ret) goto out; next2: -- cgit v1.2.1 From 2cf687039676c2b6e1ee96b0b89090aca94babcd Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Thu, 17 Jan 2013 01:22:09 -0700 Subject: Btrfs: prevent qgroup destroy when there are still relations Currently you can just destroy a qgroup even though it is in use by other qgroups or has qgroups assigned to it. This patch prevents destruction of qgroups unless they are completely unused. Otherwise destroy will return EBUSY. Reported-by: Eric Hopper Signed-off-by: Arne Jansen Signed-off-by: Chris Mason --- fs/btrfs/qgroup.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 28f2b39f6a25..a5c856234323 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -963,17 +963,28 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 qgroupid) { struct btrfs_root *quota_root; + struct btrfs_qgroup *qgroup; int ret = 0; quota_root = fs_info->quota_root; if (!quota_root) return -EINVAL; + /* check if there are no relations to this qgroup */ + spin_lock(&fs_info->qgroup_lock); + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (qgroup) { + if (!list_empty(&qgroup->groups) || !list_empty(&qgroup->members)) { + spin_unlock(&fs_info->qgroup_lock); + return -EBUSY; + } + } + spin_unlock(&fs_info->qgroup_lock); + ret = del_qgroup_item(trans, quota_root, qgroupid); spin_lock(&fs_info->qgroup_lock); del_qgroup_rb(quota_root->fs_info, qgroupid); - spin_unlock(&fs_info->qgroup_lock); return ret; -- cgit v1.2.1 From a105bb88f46b60de2adf1ee98745bd59362b09ab Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 21 Jan 2013 15:15:56 +0200 Subject: Btrfs: fix a regression in balance usage filter Commit 3fed40cc ("Btrfs: cleanup duplicated division functions"), which was merged into 3.8-rc1, has introduced a regression by removing logic that was guarding us against bad user input. Bring it back. 
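The guard being restored, in sketch form, assuming bargs->usage is a 0-100 percentage supplied by userspace; the hunk below wires the same logic into chunk_usage_filter():

	if (bargs->usage == 0)
		user_thresh = 0;			/* matches nothing */
	else if (bargs->usage > 100)
		user_thresh = cache->key.offset;	/* matches everything */
	else
		user_thresh = div_factor_fine(cache->key.offset, bargs->usage);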
Signed-off-by: Ilya Dryomov Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9c84dbe64f18..469609838913 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2614,7 +2614,14 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, cache = btrfs_lookup_block_group(fs_info, chunk_offset); chunk_used = btrfs_block_group_used(&cache->item); - user_thresh = div_factor_fine(cache->key.offset, bargs->usage); + if (bargs->usage == 0) + user_thresh = 0; + else if (bargs->usage > 100) + user_thresh = cache->key.offset; + else + user_thresh = div_factor_fine(cache->key.offset, + bargs->usage); + if (chunk_used < user_thresh) ret = 0; -- cgit v1.2.1 From 6e6093a8f144414d904575da5fdea40cf14fb63e Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 17 Jan 2013 00:08:30 +0900 Subject: f2fs: add __init to functions in init_f2fs_fs Add __init to functions in init_f2fs_fs for code consistency. Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/f2fs.h | 10 +++++----- fs/f2fs/gc.c | 2 +- fs/f2fs/node.c | 2 +- fs/f2fs/super.c | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index d75c86a17893..ff3c8439af87 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -771,7 +771,7 @@ void init_orphan_info(struct f2fs_sb_info *sbi) sbi->n_orphans = 0; } -int create_checkpoint_caches(void) +int __init create_checkpoint_caches(void) { orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry", sizeof(struct orphan_inode_entry), NULL); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 976325d51e3d..c8e2d751ef9c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -914,7 +914,7 @@ int restore_node_summary(struct f2fs_sb_info *, unsigned int, void flush_nat_entries(struct f2fs_sb_info *); int build_node_manager(struct f2fs_sb_info *); void destroy_node_manager(struct f2fs_sb_info *); -int create_node_manager_caches(void); +int __init create_node_manager_caches(void); void destroy_node_manager_caches(void); /* @@ -966,7 +966,7 @@ void sync_dirty_dir_inodes(struct f2fs_sb_info *); void block_operations(struct f2fs_sb_info *); void write_checkpoint(struct f2fs_sb_info *, bool, bool); void init_orphan_info(struct f2fs_sb_info *); -int create_checkpoint_caches(void); +int __init create_checkpoint_caches(void); void destroy_checkpoint_caches(void); /* @@ -988,7 +988,7 @@ void stop_gc_thread(struct f2fs_sb_info *); block_t start_bidx_of_node(unsigned int); int f2fs_gc(struct f2fs_sb_info *); void build_gc_manager(struct f2fs_sb_info *); -int create_gc_caches(void); +int __init create_gc_caches(void); void destroy_gc_caches(void); /* @@ -1060,7 +1060,7 @@ struct f2fs_stat_info { int f2fs_build_stats(struct f2fs_sb_info *); void f2fs_destroy_stats(struct f2fs_sb_info *); -void f2fs_create_root_stats(void); +void __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else #define stat_inc_call_count(si) @@ -1071,7 +1071,7 @@ void f2fs_destroy_root_stats(void); static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } -static inline void f2fs_create_root_stats(void) { } +static inline void __init f2fs_create_root_stats(void) { } static inline void f2fs_destroy_root_stats(void) { } #endif diff --git 
a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b4dd90cf1f18..809cfec6683c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -697,7 +697,7 @@ void build_gc_manager(struct f2fs_sb_info *sbi) DIRTY_I(sbi)->v_ops = &default_v_ops; } -int create_gc_caches(void) +int __init create_gc_caches(void) { winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes", sizeof(struct inode_entry), NULL); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5066bfd256c9..f177c018745c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1732,7 +1732,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) kfree(nm_i); } -int create_node_manager_caches(void) +int __init create_node_manager_caches(void) { nat_entry_slab = f2fs_kmem_cache_create("nat_entry", sizeof(struct nat_entry), NULL); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d551a724b736..37fad04c8669 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -640,7 +640,7 @@ static struct file_system_type f2fs_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; -static int init_inodecache(void) +static int __init init_inodecache(void) { f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", sizeof(struct f2fs_inode_info), NULL); -- cgit v1.2.1 From 692bb55d1ab5b278181ff2e65f09eb0be6d50669 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 17 Jan 2013 18:37:41 +0900 Subject: f2fs: add remap_pages as generic_file_remap_pages This was added for all the file systems before. See the following commit. commit id: 0b173bc4daa8f8ec03a85abf5e47b23502ff80af [PATCH] mm: kill vma flag VM_CAN_NONLINEAR This patch moves actual ptes filling for non-linear file mappings into special vma operation: ->remap_pages(). File system must implement this method to get non-linear mappings support, if it uses filemap_fault() then generic_file_remap_pages() can be used. Now device drivers can implement this method and obtain nonlinear vma support." Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 819de7f39f26..3191b52aafb0 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -96,8 +96,9 @@ out: } static const struct vm_operations_struct f2fs_file_vm_ops = { - .fault = filemap_fault, - .page_mkwrite = f2fs_vm_page_mkwrite, + .fault = filemap_fault, + .page_mkwrite = f2fs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int need_to_sync_dir(struct f2fs_sb_info *sbi, struct inode *inode) -- cgit v1.2.1 From c01e54b770e69c65525295eb2668be3dc0822406 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 17 Jan 2013 20:30:23 +0900 Subject: f2fs: support swapfile This patch adds f2fs_bmap operation to the data address space. This enables f2fs to support swapfile. 
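Implementing ->bmap is what exposes the block-mapping path that swapon relies on (setup_swap_extents() builds the swap extent list through bmap); the same mapping is visible from userspace via the FIBMAP ioctl. A standalone probe, runnable as root on a file in an f2fs mount:

	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>

	int main(int argc, char **argv)
	{
		int fd, block = 0;	/* logical block 0 of the file */

		if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
			return 1;
		if (ioctl(fd, FIBMAP, &block) < 0) {
			perror("FIBMAP");	/* typically needs CAP_SYS_RAWIO */
			close(fd);
			return 1;
		}
		printf("block 0 of %s -> device block %d\n", argv[1], block);
		close(fd);
		return 0;
	}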
Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b1347fc6d688..7bd22a201125 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -698,6 +698,11 @@ static int f2fs_set_data_page_dirty(struct page *page) return 0; } +static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, get_data_block_ro); +} + const struct address_space_operations f2fs_dblock_aops = { .readpage = f2fs_read_data_page, .readpages = f2fs_read_data_pages, @@ -709,4 +714,5 @@ const struct address_space_operations f2fs_dblock_aops = { .invalidatepage = f2fs_invalidate_data_page, .releasepage = f2fs_release_data_page, .direct_IO = f2fs_direct_IO, + .bmap = f2fs_bmap, }; -- cgit v1.2.1 From a7fdffbd3ea4b3cc2993af006bde38a423b38b72 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 18 Jan 2013 14:54:13 +0900 Subject: f2fs: avoid issuing small bios due to several dirty node pages If small bios for dirty node pages are issued in the middle of sequential data writes, the well-formed consecutive data bios can be split by those small node bios, resulting in performance degradation. So, let's collect a number of dirty node pages until reaching a threshold. And, by default, I set the threshold to 2MB, a segment size. This improves sequential write performance on i5, 512GB SSD (830 w/ SATA2) as follows. Before: 231 MB/s -> After: 255 MB/s Signed-off-by: Jaegeuk Kim Reviewed-by: Namjae Jeon --- fs/f2fs/node.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index f177c018745c..9bda63c9c166 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1124,6 +1124,12 @@ static int f2fs_write_node_page(struct page *page, return 0; } +/* + * It is very important to gather dirty pages and write at once, so that we can + * submit a big bio without interfering other data writes. + * Be default, 512 pages (2MB), a segment size, is quite reasonable. + */ +#define COLLECT_DIRTY_NODES 512 static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -1131,17 +1137,16 @@ static int f2fs_write_node_pages(struct address_space *mapping, struct block_device *bdev = sbi->sb->s_bdev; long nr_to_write = wbc->nr_to_write; - if (wbc->for_kupdate) - return 0; - - if (get_pages(sbi, F2FS_DIRTY_NODES) == 0) - return 0; - + /* First check balancing cached NAT entries */ if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK)) { write_checkpoint(sbi, false, false); return 0; } + /* collect a number of dirty node pages and write together */ + if (get_pages(sbi, F2FS_DIRTY_NODES) < COLLECT_DIRTY_NODES) + return 0; + /* if mounting is failed, skip writing node pages */ wbc->nr_to_write = bio_get_nr_vecs(bdev); sync_node_pages(sbi, 0, wbc); -- cgit v1.2.1 From 9af45ef5ab8ce4a13c553200dc15509441fbd68f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 21 Jan 2013 17:34:21 +0900 Subject: f2fs: add comments of start_bidx_of_node The caller of start_bidx_of_node() should give proper node offsets which point only direct node blocks. Otherwise, it is a caller's bug. This patch adds comments to make it clear.
Reported-by: Dan Carpenter Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 809cfec6683c..c386910dacc5 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -424,7 +424,11 @@ next_step: } /* - * Calculate start block index that this node page contains + * Calculate start block index indicating the given node offset. + * Be careful, caller should give this node offset only indicating direct node + * blocks. If any node offsets, which point the other types of node blocks such + * as indirect or double indirect node blocks, are given, it must be a caller's + * bug. */ block_t start_bidx_of_node(unsigned int node_ofs) { -- cgit v1.2.1 From d8b79b2f94600262fcfbffbe3df7fd3c83c6c51b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sun, 20 Jan 2013 18:02:58 +0300 Subject: f2fs: use _safe() version of list_for_each This is calling list_del() inside a loop which is a problem when we try move to the next item on the list. I've converted it to use the _safe version. And also, as a cleanup, I've converted it to use list_for_each_entry instead of list_for_each. Signed-off-by: Dan Carpenter Reviewed-by: Dmitry Torokhov Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 6cc046d36815..f42e4060b399 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -173,10 +173,9 @@ out: static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { - struct list_head *this; - struct fsync_inode_entry *entry; - list_for_each(this, head) { - entry = list_entry(this, struct fsync_inode_entry, list); + struct fsync_inode_entry *entry, *tmp; + + list_for_each_entry_safe(entry, tmp, head, list) { iput(entry->inode); list_del(&entry->list); kmem_cache_free(fsync_entry_slab, entry); -- cgit v1.2.1 From 10b8c7dff5d3633b69e77f57d404dab54ead3787 Mon Sep 17 00:00:00 2001 From: Cong Ding Date: Tue, 22 Jan 2013 19:20:58 -0500 Subject: fs/cifs/cifs_dfs_ref.c: fix potential memory leakage When it goes to error through line 144, the memory allocated to *devname is not freed, and the caller doesn't free it either in line 250. So we free the memroy of *devname in function cifs_compose_mount_options() when it goes to error. Signed-off-by: Cong Ding CC: stable Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifs_dfs_ref.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index ce5cbd717bfc..210fce2df308 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -226,6 +226,8 @@ compose_mount_options_out: compose_mount_options_err: kfree(mountdata); mountdata = ERR_PTR(rc); + kfree(*devname); + *devname = NULL; goto compose_mount_options_out; } -- cgit v1.2.1 From 201a90389424d6771d24fc5d72f7e34cb4a8f967 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 24 Jan 2013 12:02:07 -0500 Subject: Btrfs: do not allow logged extents to be merged or removed We drop the extent map tree lock while we're logging extents, so somebody could come in and merge another extent into this one and screw up our logging, or they could even remove us from the list which would keep us from logging the extent or freeing our ref on it, so we need to make sure to not clear LOGGING until after the extent is logged, and then we can merge it to adjacent extents. 
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_map.c | 13 ++++++++++++- fs/btrfs/extent_map.h | 1 + fs/btrfs/tree-log.c | 5 +++-- 3 files changed, 16 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index fff2c28497b6..ed88f5ee4bea 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -171,6 +171,10 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags)) return 0; + if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) || + test_bit(EXTENT_FLAG_LOGGING, &next->flags)) + return 0; + if (extent_map_end(prev) == next->start && prev->flags == next->flags && prev->bdev == next->bdev && @@ -256,7 +260,8 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, if (!em) goto out; - list_move(&em->list, &tree->modified_extents); + if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) + list_move(&em->list, &tree->modified_extents); em->generation = gen; clear_bit(EXTENT_FLAG_PINNED, &em->flags); em->mod_start = em->start; @@ -281,6 +286,12 @@ out: } +void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) +{ + clear_bit(EXTENT_FLAG_LOGGING, &em->flags); + try_merge_map(tree, em); +} + /** * add_extent_mapping - add new extent map to the extent tree * @tree: tree to insert new map in diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 922943ce29e8..c6598c89cff8 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -69,6 +69,7 @@ void free_extent_map(struct extent_map *em); int __init extent_map_init(void); void extent_map_exit(void); int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen); +void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em); struct extent_map *search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 83186c7e45d4..de8899b04d69 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3410,13 +3410,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, em = list_entry(extents.next, struct extent_map, list); list_del_init(&em->list); - clear_bit(EXTENT_FLAG_LOGGING, &em->flags); /* * If we had an error we just need to delete everybody from our * private list. */ if (ret) { + clear_em_logging(tree, em); free_extent_map(em); continue; } @@ -3424,8 +3424,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, write_unlock(&tree->lock); ret = log_one_extent(trans, inode, root, em, path); - free_extent_map(em); write_lock(&tree->lock); + clear_em_logging(tree, em); + free_extent_map(em); } WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); -- cgit v1.2.1 From b0175117b9376a69978bbe80af26fb95dddbd53e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Dec 2012 11:39:19 -0500 Subject: Btrfs: fix panic when recovering tree log A user reported a BUG_ON(ret) that occurred during tree log replay. Ret was -EAGAIN, so what I think happened is that we removed an extent that covered a bitmap entry and an extent entry. We remove the part from the bitmap and return -EAGAIN and then search for the next piece we want to remove, which happens to be an entire extent entry, so we just free the sucker and return. The problem is ret is still set to -EAGAIN so we trip the BUG_ON(). The user used btrfs-zero-log, so I'm not 100% sure this is what happened; I've added a WARN_ON() to catch the other possibility.
Thanks, Reported-by: Jan Steffens Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 59ea2e4349c9..0be7a8742a43 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1862,11 +1862,13 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *info; - int ret = 0; + int ret; + bool re_search = false; spin_lock(&ctl->tree_lock); again: + ret = 0; if (!bytes) goto out_lock; @@ -1879,17 +1881,17 @@ again: info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); if (!info) { - /* the tree logging code might be calling us before we - * have fully loaded the free space rbtree for this - * block group. So it is possible the entry won't - * be in the rbtree yet at all. The caching code - * will make sure not to put it in the rbtree if - * the logging code has pinned it. + /* + * If we found a partial bit of our free space in a + * bitmap but then couldn't find the other part this may + * be a problem, so WARN about it. */ + WARN_ON(re_search); goto out_lock; } } + re_search = false; if (!info->bitmap) { unlink_free_space(ctl, info); if (offset == info->offset) { @@ -1935,8 +1937,10 @@ again: } ret = remove_from_bitmap(ctl, info, &offset, &bytes); - if (ret == -EAGAIN) + if (ret == -EAGAIN) { + re_search = true; goto again; + } BUG_ON(ret); /* logic error */ out_lock: spin_unlock(&ctl->tree_lock); -- cgit v1.2.1 From 192000dda22e02225772e862b92e7c09e5a17d08 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Sun, 6 Jan 2013 03:38:22 +0000 Subject: Btrfs: use right range to find checksum for compressed extents For compressed extents, the range of checksum is covered by disk length, and the disk length is different with ram length, so we need to use disk length instead to get us the right checksum. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index de8899b04d69..9027bb1e7466 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3357,6 +3357,11 @@ static int log_one_extent(struct btrfs_trans_handle *trans, if (skip_csum) return 0; + if (em->compress_type) { + csum_offset = 0; + csum_len = block_len; + } + /* block start is already adjusted for the file extent offset. */ ret = btrfs_lookup_csums_range(log->fs_info->csum_root, em->block_start + csum_offset, -- cgit v1.2.1 From e58dd74bccb4317e39e4b675bf9c6cd133608fac Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 22 Jan 2013 15:43:09 -0500 Subject: Btrfs: put csums on the right ordered extent I noticed a WARN_ON going off when adding csums because we were going over the amount of csum bytes that should have been allowed for an ordered extent. This is a leftover from when we used to hold the csums privately for direct io, but now we use the normal ordered sum stuff so we need to make sure and check if we've moved on to another extent so that the csums are added to the right extent. Without this we could end up with csums for bytenrs that don't have extents to cover them yet. 
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/file-item.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index bd38cef42358..94aa53b38721 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -460,8 +460,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, if (!contig) offset = page_offset(bvec->bv_page) + bvec->bv_offset; - if (!contig && (offset >= ordered->file_offset + ordered->len || - offset < ordered->file_offset)) { + if (offset >= ordered->file_offset + ordered->len || + offset < ordered->file_offset) { unsigned long bytes_left; sums->len = this_sum_bytes; this_sum_bytes = 0; -- cgit v1.2.1 From 8d25a086eb104297e3ba1fdd180b04cfaaa84797 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 15 Jan 2013 06:27:25 +0000 Subject: Btrfs: Add ACCESS_ONCE() to transaction->abort accesses We may access and update transaction->aborted on different CPUs without a lock, so we need the ACCESS_ONCE() wrapper to prevent the compiler from creating unsolicited accesses and to make sure we can get the right value. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/super.c | 2 +- fs/btrfs/transaction.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 99545df1b86c..d8982e9601d3 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -267,7 +267,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, function, line, errstr); return; } - trans->transaction->aborted = errno; + ACCESS_ONCE(trans->transaction->aborted) = errno; __btrfs_std_error(root->fs_info, function, line, errno, NULL); } /* diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 87fac9a21ea5..0ef29611fade 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1468,7 +1468,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, goto cleanup_transaction; } - if (cur_trans->aborted) { + /* Stop the commit early if ->aborted is set */ + if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { ret = cur_trans->aborted; goto cleanup_transaction; } -- cgit v1.2.1 From 2cba30f172afdfa00f3e844f42f21eb3b972d01c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 15 Jan 2013 06:29:12 +0000 Subject: Btrfs: fix missed transaction->aborted check First, though the current transaction->aborted check can stop the commit early and avoid unnecessary operations, it is too early, and some transaction handles don't end until later; those handles may set transaction->aborted after the check. Second, when we commit the transaction, we will wake up some worker threads to flush the space cache and inode cache. Those threads also allocate some transaction handles and may set transaction->aborted if some serious error happens. So we need more checks for ->aborted when committing the transaction. Fix it.
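For reference, ACCESS_ONCE() in this era's <linux/compiler.h> is just a cast through a volatile lvalue, which forces exactly one real load or store and keeps the compiler from re-reading, caching, or tearing ->aborted around these checks:

	#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

	/* read side: a single load, performed at this point */
	if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
		ret = cur_trans->aborted;
		goto cleanup_transaction;
	}

	/* write side: a single store, not subject to invented accesses */
	ACCESS_ONCE(trans->transaction->aborted) = errno;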
Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 0ef29611fade..f15494699f3b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1575,6 +1575,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); + /* ->aborted might be set after the previous check, so check it */ + if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { + ret = cur_trans->aborted; + goto cleanup_transaction; + } /* * the reloc mutex makes sure that we stop * the balancing code from coming in and moving @@ -1658,6 +1663,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, goto cleanup_transaction; } + /* + * The tasks which save the space cache and inode cache may also + * update ->aborted, check it. + */ + if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { + ret = cur_trans->aborted; + mutex_unlock(&root->fs_info->tree_log_mutex); + mutex_unlock(&root->fs_info->reloc_mutex); + goto cleanup_transaction; + } + btrfs_prepare_extent_commit(trans, root); cur_trans = root->fs_info->running_transaction; -- cgit v1.2.1 From c9f01bfe0ca411b4751d7fdbb9d602035ba52f75 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 16 Jan 2013 11:27:17 +0000 Subject: Btrfs: fix wrong max device number for single profile The max device number of the single profile is 1, not 0 (0 means 'as many as possible'). Fix it. Cc: Liu Bo Signed-off-by: Miao Xie Reviewed-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 469609838913..15f6efdf6463 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3507,7 +3507,7 @@ struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { { 1, 1, 2, 2, 2, 2 /* raid1 */ }, { 1, 2, 1, 1, 1, 2 /* dup */ }, { 1, 1, 0, 2, 1, 1 /* raid0 */ }, - { 1, 1, 0, 1, 1, 1 /* single */ }, + { 1, 1, 1, 1, 1, 1 /* single */ }, }; static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, -- cgit v1.2.1 From 1eafa6c73791e4f312324ddad9cbcaf6a1b6052b Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 22 Jan 2013 10:49:00 +0000 Subject: Btrfs: fix repeated delalloc work allocation btrfs_start_delalloc_inodes() locks the delalloc_inodes list, fetches the first inode, unlocks the list, triggers btrfs_alloc_delalloc_work/ btrfs_queue_worker for this inode, and then it locks the list, checks the head of the list again. But because we don't delete the first inode from the list before dealing with it, it will fetch the same inode. As a result, this function allocates a huge number of btrfs_delalloc_work structures, and OOM happens. Fix this problem by splicing this delalloc list.
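The splice-and-drain idiom in isolation, as a generic sketch rather than the btrfs-specific code: move the whole shared list onto a private head under the lock, then walk the private copy, detaching each entry before the lock is dropped, so the loop can never fetch the same node twice:

	LIST_HEAD(splice);

	spin_lock(&lock);
	list_splice_init(&shared_list, &splice);
	while (!list_empty(&splice)) {
		struct item *it = list_first_entry(&splice, struct item, list);

		list_del_init(&it->list);	/* detach before unlocking */
		spin_unlock(&lock);
		process(it);			/* may block or requeue the item */
		spin_lock(&lock);
	}
	spin_unlock(&lock);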
Reported-by: Alex Lyakas
Signed-off-by: Miao Xie
Signed-off-by: Josef Bacik
---
 fs/btrfs/inode.c | 55 +++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 41 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9bc6c40b182d..ca7ace7b7b52 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7585,41 +7585,61 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
  */
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 {
-	struct list_head *head = &root->fs_info->delalloc_inodes;
 	struct btrfs_inode *binode;
 	struct inode *inode;
 	struct btrfs_delalloc_work *work, *next;
 	struct list_head works;
+	struct list_head splice;
 	int ret = 0;
 
 	if (root->fs_info->sb->s_flags & MS_RDONLY)
 		return -EROFS;
 
 	INIT_LIST_HEAD(&works);
-
+	INIT_LIST_HEAD(&splice);
+again:
 	spin_lock(&root->fs_info->delalloc_lock);
-	while (!list_empty(head)) {
-		binode = list_entry(head->next, struct btrfs_inode,
+	list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+	while (!list_empty(&splice)) {
+		binode = list_entry(splice.next, struct btrfs_inode,
 				    delalloc_inodes);
+
+		list_del_init(&binode->delalloc_inodes);
+
 		inode = igrab(&binode->vfs_inode);
 		if (!inode)
-			list_del_init(&binode->delalloc_inodes);
+			continue;
+
+		list_add_tail(&binode->delalloc_inodes,
+			      &root->fs_info->delalloc_inodes);
 		spin_unlock(&root->fs_info->delalloc_lock);
-		if (inode) {
-			work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
-			if (!work) {
-				ret = -ENOMEM;
-				goto out;
-			}
-			list_add_tail(&work->list, &works);
-			btrfs_queue_worker(&root->fs_info->flush_workers,
-					   &work->work);
+
+		work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
+		if (unlikely(!work)) {
+			ret = -ENOMEM;
+			goto out;
 		}
+		list_add_tail(&work->list, &works);
+		btrfs_queue_worker(&root->fs_info->flush_workers,
+				   &work->work);
+
 		cond_resched();
 		spin_lock(&root->fs_info->delalloc_lock);
 	}
 	spin_unlock(&root->fs_info->delalloc_lock);
 
+	list_for_each_entry_safe(work, next, &works, list) {
+		list_del_init(&work->list);
+		btrfs_wait_and_free_delalloc_work(work);
+	}
+
+	spin_lock(&root->fs_info->delalloc_lock);
+	if (!list_empty(&root->fs_info->delalloc_inodes)) {
+		spin_unlock(&root->fs_info->delalloc_lock);
+		goto again;
+	}
+	spin_unlock(&root->fs_info->delalloc_lock);
+
 	/* the filemap_flush will queue IO into the worker threads, but
 	 * we have to make sure the IO is actually started and that
 	 * ordered extents get created before we return
@@ -7632,11 +7652,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 			    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
 	}
 	atomic_dec(&root->fs_info->async_submit_draining);
+	return 0;
 out:
 	list_for_each_entry_safe(work, next, &works, list) {
 		list_del_init(&work->list);
 		btrfs_wait_and_free_delalloc_work(work);
 	}
+
+	if (!list_empty_careful(&splice)) {
+		spin_lock(&root->fs_info->delalloc_lock);
+		list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
+		spin_unlock(&root->fs_info->delalloc_lock);
+	}
 	return ret;
 }
--
cgit v1.2.1

From d4e0bfec9b6fbb9b58640b44e01bb74ae0d29b22 Mon Sep 17 00:00:00 2001
From: David Teigland
Date: Thu, 3 Jan 2013 17:52:07 -0500
Subject: GFS2: fix skip unlock condition

The recent commit fb6791d100d1bba20b5cdbc4912e1f7086ec60f8 included the
wrong logic. The lvbptr check was incorrectly added after the patch was
tested.
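To see exactly how the old and new conditions differ, here is a small
stand-alone sketch (plain C, not the gfs2 code; the helper names are
invented). The intent is to skip dlm_unlock() unless the lock value block
must be written back, which is only the case when an lvb pointer exists and
the lock is held exclusively. The earlier condition instead required an lvb
pointer in order to skip at all, so the two predicates disagree whenever
there is no lvb.

#include <stdbool.h>
#include <stdio.h>

/* buggy predicate: a lock without an lvbptr is never skipped */
static bool skip_unlock_old(bool skip_flag, bool has_lvbptr, bool is_ex)
{
	return skip_flag && has_lvbptr && !is_ex;
}

/* fixed predicate: skip unless the lvb needs dlm_unlock to write it */
static bool skip_unlock_new(bool skip_flag, bool has_lvbptr, bool is_ex)
{
	bool lvb_needs_unlock = has_lvbptr && is_ex;

	return skip_flag && !lvb_needs_unlock;
}

int main(void)
{
	/* truth table with the SDF_SKIP_DLM_UNLOCK flag set */
	for (int lvb = 0; lvb <= 1; lvb++)
		for (int ex = 0; ex <= 1; ex++)
			printf("lvbptr=%d ex=%d  old=%d new=%d\n", lvb, ex,
			       skip_unlock_old(true, lvb, ex),
			       skip_unlock_new(true, lvb, ex));
	return 0;
}

The table shows the predicates match only when an lvb pointer is present;
without one, the old code always issued the unlock it was meant to skip.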
Signed-off-by: David Teigland
Signed-off-by: Steven Whitehouse
---
 fs/gfs2/lock_dlm.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index b906ed17a839..9802de0f85e6 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -281,6 +281,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
 {
 	struct gfs2_sbd *sdp = gl->gl_sbd;
 	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	int lvb_needs_unlock = 0;
 	int error;
 
 	if (gl->gl_lksb.sb_lkid == 0) {
@@ -294,8 +295,12 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
 	gfs2_update_request_times(gl);
 
 	/* don't want to skip dlm_unlock writing the lvb when lock is ex */
+
+	if (gl->gl_lksb.sb_lvbptr && (gl->gl_state == LM_ST_EXCLUSIVE))
+		lvb_needs_unlock = 1;
+
 	if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) &&
-	    gl->gl_lksb.sb_lvbptr && (gl->gl_state != LM_ST_EXCLUSIVE)) {
+	    !lvb_needs_unlock) {
 		gfs2_glock_free(gl);
 		return;
 	}
--
cgit v1.2.1