From da53be12bbb4fabbe2e9f6f908de0cf478b5161d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 21 May 2013 15:22:44 -0700 Subject: Don't pass inode to ->d_hash() and ->d_compare() Instances either don't look at it at all (the majority of cases) or only want it to find the superblock (which can be had as dentry->d_sb). A few cases that want more are actually safe with dentry->d_inode - the only precaution needed is the check that it hadn't been replaced with NULL by rmdir() or by overwriting rename(), which case should be simply treated as cache miss. Signed-off-by: Linus Torvalds Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 6 ++---- Documentation/filesystems/vfs.txt | 19 ++++++++----------- 2 files changed, 10 insertions(+), 15 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index bdd82b2339d9..f94a362f408e 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -11,10 +11,8 @@ be able to use diff(1). prototypes: int (*d_revalidate)(struct dentry *, unsigned int); int (*d_weak_revalidate)(struct dentry *, unsigned int); - int (*d_hash)(const struct dentry *, const struct inode *, - struct qstr *); - int (*d_compare)(const struct dentry *, const struct inode *, - const struct dentry *, const struct inode *, + int (*d_hash)(const struct dentry *, struct qstr *); + int (*d_compare)(const struct dentry *, const struct dentry *, unsigned int, const char *, const struct qstr *); int (*d_delete)(struct dentry *); void (*d_release)(struct dentry *); diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 4a35f6614a66..51ba44e3fc40 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -901,10 +901,8 @@ defined: struct dentry_operations { int (*d_revalidate)(struct dentry *, unsigned int); int (*d_weak_revalidate)(struct dentry *, unsigned int); - int (*d_hash)(const struct dentry *, const struct inode *, - struct qstr *); - int (*d_compare)(const struct dentry *, const struct inode *, - const struct dentry *, const struct inode *, + int (*d_hash)(const struct dentry *, struct qstr *); + int (*d_compare)(const struct dentry *, const struct dentry *, unsigned int, const char *, const struct qstr *); int (*d_delete)(const struct dentry *); void (*d_release)(struct dentry *); @@ -949,25 +947,24 @@ struct dentry_operations { d_hash: called when the VFS adds a dentry to the hash table. The first dentry passed to d_hash is the parent directory that the name is - to be hashed into. The inode is the dentry's inode. + to be hashed into. Same locking and synchronisation rules as d_compare regarding what is safe to dereference etc. d_compare: called to compare a dentry name with a given name. The first dentry is the parent of the dentry to be compared, the second is - the parent's inode, then the dentry and inode (may be NULL) of the - child dentry. len and name string are properties of the dentry to be - compared. qstr is the name to compare it with. + the child dentry. len and name string are properties of the dentry + to be compared. qstr is the name to compare it with. Must be constant and idempotent, and should not take locks if - possible, and should not or store into the dentry or inodes. - Should not dereference pointers outside the dentry or inodes without + possible, and should not or store into the dentry. + Should not dereference pointers outside the dentry without lots of care (eg. d_parent, d_inode, d_name should not be used). However, our vfsmount is pinned, and RCU held, so the dentries and inodes won't disappear, neither will our sb or filesystem module. - ->i_sb and ->d_sb may be used. + ->d_sb may be used. It is a tricky calling convention because it needs to be called under "rcu-walk", ie. without any locks or references on things. -- cgit v1.2.3 From 1c8c601a8c0dc59fe64907dcd9d512a3d181ddc7 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 21 Jun 2013 08:58:15 -0400 Subject: locks: protect most of the file_lock handling with i_lock Having a global lock that protects all of this code is a clear scalability problem. Instead of doing that, move most of the code to be protected by the i_lock instead. The exceptions are the global lists that the ->fl_link sits on, and the ->fl_block list. ->fl_link is what connects these structures to the global lists, so we must ensure that we hold those locks when iterating over or updating these lists. Furthermore, sound deadlock detection requires that we hold the blocked_list state steady while checking for loops. We also must ensure that the search and update to the list are atomic. For the checking and insertion side of the blocked_list, push the acquisition of the global lock into __posix_lock_file and ensure that checking and update of the blocked_list is done without dropping the lock in between. On the removal side, when waking up blocked lock waiters, take the global lock before walking the blocked list and dequeue the waiters from the global list prior to removal from the fl_block list. With this, deadlock detection should be race free while we minimize excessive file_lock_lock thrashing. Finally, in order to avoid a lock inversion problem when handling /proc/locks output we must ensure that manipulations of the fl_block list are also protected by the file_lock_lock. Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 21 +++-- fs/afs/flock.c | 7 +- fs/ceph/locks.c | 2 +- fs/ceph/mds_client.c | 8 +- fs/cifs/cifsfs.c | 2 +- fs/cifs/file.c | 13 +-- fs/gfs2/file.c | 2 +- fs/lockd/svcsubs.c | 12 +-- fs/locks.c | 164 ++++++++++++++++++++++++-------------- fs/nfs/delegation.c | 10 +-- fs/nfs/nfs4state.c | 8 +- fs/nfsd/nfs4state.c | 8 +- include/linux/fs.h | 11 --- 13 files changed, 155 insertions(+), 113 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index f94a362f408e..c2963a74fbc3 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -342,7 +342,7 @@ prototypes: locking rules: - file_lock_lock may block + inode->i_lock may block fl_copy_lock: yes no fl_release_private: maybe no @@ -355,12 +355,19 @@ prototypes: int (*lm_change)(struct file_lock **, int); locking rules: - file_lock_lock may block -lm_compare_owner: yes no -lm_notify: yes no -lm_grant: no no -lm_break: yes no -lm_change yes no + + inode->i_lock file_lock_lock may block +lm_compare_owner: yes[1] maybe no +lm_notify: yes yes no +lm_grant: no no no +lm_break: yes no no +lm_change yes no no + +[1]: ->lm_compare_owner is generally called with *an* inode->i_lock held. It +may not be the i_lock of the inode for either file_lock being compared! This is +the case with deadlock detection, since the code has to chase down the owners +of locks that may be entirely unrelated to the one on which the lock is being +acquired. When doing a search for deadlocks, the file_lock_lock is also held. --------------------------- buffer_head ----------------------------------- prototypes: diff --git a/fs/afs/flock.c b/fs/afs/flock.c index 2497bf306c70..a8cf2cff836c 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -252,7 +252,8 @@ static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key) */ static int afs_do_setlk(struct file *file, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host); + struct inode *inode = file_inode(file); + struct afs_vnode *vnode = AFS_FS_I(inode); afs_lock_type_t type; struct key *key = file->private_data; int ret; @@ -273,7 +274,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl) type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; - lock_flocks(); + spin_lock(&inode->i_lock); /* make sure we've got a callback on this file and that our view of the * data version is up to date */ @@ -420,7 +421,7 @@ given_lock: afs_vnode_fetch_status(vnode, NULL, key); error: - unlock_flocks(); + spin_unlock(&inode->i_lock); _leave(" = %d", ret); return ret; diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index ebbf680378e2..690f73f42425 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -192,7 +192,7 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) /** * Encode the flock and fcntl locks for the given inode into the ceph_filelock - * array. Must be called with lock_flocks() already held. + * array. Must be called with inode->i_lock already held. * If we encounter more of a specific lock type than expected, return -ENOSPC. */ int ceph_encode_locks_to_buffer(struct inode *inode, diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 4d2920304be8..74fd2898b2ab 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2481,20 +2481,20 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_filelock *flocks; encode_again: - lock_flocks(); + spin_lock(&inode->i_lock); ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); - unlock_flocks(); + spin_unlock(&inode->i_lock); flocks = kmalloc((num_fcntl_locks+num_flock_locks) * sizeof(struct ceph_filelock), GFP_NOFS); if (!flocks) { err = -ENOMEM; goto out_free; } - lock_flocks(); + spin_lock(&inode->i_lock); err = ceph_encode_locks_to_buffer(inode, flocks, num_fcntl_locks, num_flock_locks); - unlock_flocks(); + spin_unlock(&inode->i_lock); if (err) { kfree(flocks); if (err == -ENOSPC) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 540c1ccfcdb2..a445e71746fa 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -765,7 +765,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence) static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) { - /* note that this is called by vfs setlease with lock_flocks held + /* note that this is called by vfs setlease with i_lock held to protect *lease from going away */ struct inode *inode = file_inode(file); struct cifsFileInfo *cfile = file->private_data; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 1686e4085646..0630710a9c3f 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1092,6 +1092,7 @@ struct lock_to_push { static int cifs_push_posix_locks(struct cifsFileInfo *cfile) { + struct inode *inode = cfile->dentry->d_inode; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct file_lock *flock, **before; unsigned int count = 0, i = 0; @@ -1102,12 +1103,12 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) xid = get_xid(); - lock_flocks(); - cifs_for_each_lock(cfile->dentry->d_inode, before) { + spin_lock(&inode->i_lock); + cifs_for_each_lock(inode, before) { if ((*before)->fl_flags & FL_POSIX) count++; } - unlock_flocks(); + spin_unlock(&inode->i_lock); INIT_LIST_HEAD(&locks_to_send); @@ -1126,8 +1127,8 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) } el = locks_to_send.next; - lock_flocks(); - cifs_for_each_lock(cfile->dentry->d_inode, before) { + spin_lock(&inode->i_lock); + cifs_for_each_lock(inode, before) { flock = *before; if ((flock->fl_flags & FL_POSIX) == 0) continue; @@ -1152,7 +1153,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) lck->offset = flock->fl_start; el = el->next; } - unlock_flocks(); + spin_unlock(&inode->i_lock); list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) { int stored_rc; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index b3333371aebb..cebfd404c1d9 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -889,7 +889,7 @@ out_uninit: * cluster; until we do, disable leases (by just returning -EINVAL), * unless the administrator has requested purely local locking. * - * Locking: called under lock_flocks + * Locking: called under i_lock * * Returns: errno */ diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 97e87415b145..dc5c75930f0f 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -169,7 +169,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file, again: file->f_locks = 0; - lock_flocks(); /* protects i_flock list */ + spin_lock(&inode->i_lock); for (fl = inode->i_flock; fl; fl = fl->fl_next) { if (fl->fl_lmops != &nlmsvc_lock_operations) continue; @@ -181,7 +181,7 @@ again: if (match(lockhost, host)) { struct file_lock lock = *fl; - unlock_flocks(); + spin_unlock(&inode->i_lock); lock.fl_type = F_UNLCK; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; @@ -193,7 +193,7 @@ again: goto again; } } - unlock_flocks(); + spin_unlock(&inode->i_lock); return 0; } @@ -228,14 +228,14 @@ nlm_file_inuse(struct nlm_file *file) if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares) return 1; - lock_flocks(); + spin_lock(&inode->i_lock); for (fl = inode->i_flock; fl; fl = fl->fl_next) { if (fl->fl_lmops == &nlmsvc_lock_operations) { - unlock_flocks(); + spin_unlock(&inode->i_lock); return 1; } } - unlock_flocks(); + spin_unlock(&inode->i_lock); file->f_locks = 0; return 0; } diff --git a/fs/locks.c b/fs/locks.c index 89d898bce166..ce302d43822b 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -153,27 +153,37 @@ int lease_break_time = 45; #define for_each_lock(inode, lockp) \ for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next) -/* The global file_lock_list is only used for displaying /proc/locks. */ +/* + * The global file_lock_list is only used for displaying /proc/locks. Protected + * by the file_lock_lock. + */ static LIST_HEAD(file_lock_list); -/* The blocked_list is used to find POSIX lock loops for deadlock detection. */ +/* + * The blocked_list is used to find POSIX lock loops for deadlock detection. + * Protected by file_lock_lock. + */ static LIST_HEAD(blocked_list); -/* Protects the two list heads above, plus the inode->i_flock list */ +/* + * This lock protects the blocked_list, and the file_lock_list. Generally, if + * you're accessing one of those lists, you want to be holding this lock. + * + * In addition, it also protects the fl->fl_block list, and the fl->fl_next + * pointer for file_lock structures that are acting as lock requests (in + * contrast to those that are acting as records of acquired locks). + * + * Note that when we acquire this lock in order to change the above fields, + * we often hold the i_lock as well. In certain cases, when reading the fields + * protected by this lock, we can skip acquiring it iff we already hold the + * i_lock. + * + * In particular, adding an entry to the fl_block list requires that you hold + * both the i_lock and the blocked_lock_lock (acquired in that order). Deleting + * an entry from the list however only requires the file_lock_lock. + */ static DEFINE_SPINLOCK(file_lock_lock); -void lock_flocks(void) -{ - spin_lock(&file_lock_lock); -} -EXPORT_SYMBOL_GPL(lock_flocks); - -void unlock_flocks(void) -{ - spin_unlock(&file_lock_lock); -} -EXPORT_SYMBOL_GPL(unlock_flocks); - static struct kmem_cache *filelock_cache __read_mostly; static void locks_init_lock_heads(struct file_lock *fl) @@ -489,13 +499,17 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2) static inline void locks_insert_global_locks(struct file_lock *fl) { + spin_lock(&file_lock_lock); list_add_tail(&fl->fl_link, &file_lock_list); + spin_unlock(&file_lock_lock); } static inline void locks_delete_global_locks(struct file_lock *fl) { + spin_lock(&file_lock_lock); list_del_init(&fl->fl_link); + spin_unlock(&file_lock_lock); } static inline void @@ -512,6 +526,8 @@ locks_delete_global_blocked(struct file_lock *waiter) /* Remove waiter from blocker's block list. * When blocker ends up pointing to itself then the list is empty. + * + * Must be called with file_lock_lock held. */ static void __locks_delete_block(struct file_lock *waiter) { @@ -520,37 +536,47 @@ static void __locks_delete_block(struct file_lock *waiter) waiter->fl_next = NULL; } -/* - */ static void locks_delete_block(struct file_lock *waiter) { - lock_flocks(); + spin_lock(&file_lock_lock); __locks_delete_block(waiter); - unlock_flocks(); + spin_unlock(&file_lock_lock); } /* Insert waiter into blocker's block list. * We use a circular list so that processes can be easily woken up in * the order they blocked. The documentation doesn't require this but * it seems like the reasonable thing to do. + * + * Must be called with file_lock_lock held! */ -static void locks_insert_block(struct file_lock *blocker, - struct file_lock *waiter) +static void __locks_insert_block(struct file_lock *blocker, + struct file_lock *waiter) { BUG_ON(!list_empty(&waiter->fl_block)); waiter->fl_next = blocker; list_add_tail(&waiter->fl_block, &blocker->fl_block); if (IS_POSIX(blocker)) - locks_insert_global_blocked(request); + locks_insert_global_blocked(waiter); +} + +/* Must be called with i_lock held. */ +static void locks_insert_block(struct file_lock *blocker, + struct file_lock *waiter) +{ + spin_lock(&file_lock_lock); + __locks_insert_block(blocker, waiter); + spin_unlock(&file_lock_lock); } /* * Wake up processes blocked waiting for blocker. * - * Must be called with the file_lock_lock held! + * Must be called with the inode->i_lock held! */ static void locks_wake_up_blocks(struct file_lock *blocker) { + spin_lock(&file_lock_lock); while (!list_empty(&blocker->fl_block)) { struct file_lock *waiter; @@ -562,10 +588,13 @@ static void locks_wake_up_blocks(struct file_lock *blocker) else wake_up(&waiter->fl_wait); } + spin_unlock(&file_lock_lock); } /* Insert file lock fl into an inode's lock list at the position indicated * by pos. At the same time add the lock to the global file lock list. + * + * Must be called with the i_lock held! */ static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl) { @@ -583,6 +612,8 @@ static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl) * Wake up processes that are blocked waiting for this lock, * notify the FS that the lock has been cleared and * finally free the lock. + * + * Must be called with the i_lock held! */ static void locks_delete_lock(struct file_lock **thisfl_p) { @@ -652,8 +683,9 @@ void posix_test_lock(struct file *filp, struct file_lock *fl) { struct file_lock *cfl; + struct inode *inode = file_inode(filp); - lock_flocks(); + spin_lock(&inode->i_lock); for (cfl = file_inode(filp)->i_flock; cfl; cfl = cfl->fl_next) { if (!IS_POSIX(cfl)) continue; @@ -666,7 +698,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl) fl->fl_pid = pid_vnr(cfl->fl_nspid); } else fl->fl_type = F_UNLCK; - unlock_flocks(); + spin_unlock(&inode->i_lock); return; } EXPORT_SYMBOL(posix_test_lock); @@ -710,6 +742,7 @@ static struct file_lock *what_owner_is_waiting_for(struct file_lock *block_fl) return NULL; } +/* Must be called with the file_lock_lock held! */ static int posix_locks_deadlock(struct file_lock *caller_fl, struct file_lock *block_fl) { @@ -745,7 +778,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) return -ENOMEM; } - lock_flocks(); + spin_lock(&inode->i_lock); if (request->fl_flags & FL_ACCESS) goto find_conflict; @@ -775,9 +808,9 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) * give it the opportunity to lock the file. */ if (found) { - unlock_flocks(); + spin_unlock(&inode->i_lock); cond_resched(); - lock_flocks(); + spin_lock(&inode->i_lock); } find_conflict: @@ -804,7 +837,7 @@ find_conflict: error = 0; out: - unlock_flocks(); + spin_unlock(&inode->i_lock); if (new_fl) locks_free_lock(new_fl); return error; @@ -834,7 +867,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str new_fl2 = locks_alloc_lock(); } - lock_flocks(); + spin_lock(&inode->i_lock); /* * New lock request. Walk all POSIX locks and look for conflicts. If * there are any, either return error or put the request on the @@ -852,11 +885,17 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str error = -EAGAIN; if (!(request->fl_flags & FL_SLEEP)) goto out; + /* + * Deadlock detection and insertion into the blocked + * locks list must be done while holding the same lock! + */ error = -EDEADLK; - if (posix_locks_deadlock(request, fl)) - goto out; - error = FILE_LOCK_DEFERRED; - locks_insert_block(fl, request); + spin_lock(&file_lock_lock); + if (likely(!posix_locks_deadlock(request, fl))) { + error = FILE_LOCK_DEFERRED; + __locks_insert_block(fl, request); + } + spin_unlock(&file_lock_lock); goto out; } } @@ -1006,7 +1045,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str locks_wake_up_blocks(left); } out: - unlock_flocks(); + spin_unlock(&inode->i_lock); /* * Free any unused locks. */ @@ -1081,14 +1120,14 @@ int locks_mandatory_locked(struct inode *inode) /* * Search the lock list for this inode for any POSIX locks. */ - lock_flocks(); + spin_lock(&inode->i_lock); for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { if (!IS_POSIX(fl)) continue; if (fl->fl_owner != owner) break; } - unlock_flocks(); + spin_unlock(&inode->i_lock); return fl ? -EAGAIN : 0; } @@ -1231,7 +1270,7 @@ int __break_lease(struct inode *inode, unsigned int mode) if (IS_ERR(new_fl)) return PTR_ERR(new_fl); - lock_flocks(); + spin_lock(&inode->i_lock); time_out_leases(inode); @@ -1281,11 +1320,11 @@ restart: break_time++; } locks_insert_block(flock, new_fl); - unlock_flocks(); + spin_unlock(&inode->i_lock); error = wait_event_interruptible_timeout(new_fl->fl_wait, !new_fl->fl_next, break_time); - lock_flocks(); - __locks_delete_block(new_fl); + spin_lock(&inode->i_lock); + locks_delete_block(new_fl); if (error >= 0) { if (error == 0) time_out_leases(inode); @@ -1302,7 +1341,7 @@ restart: } out: - unlock_flocks(); + spin_unlock(&inode->i_lock); locks_free_lock(new_fl); return error; } @@ -1355,9 +1394,10 @@ EXPORT_SYMBOL(lease_get_mtime); int fcntl_getlease(struct file *filp) { struct file_lock *fl; + struct inode *inode = file_inode(filp); int type = F_UNLCK; - lock_flocks(); + spin_lock(&inode->i_lock); time_out_leases(file_inode(filp)); for (fl = file_inode(filp)->i_flock; fl && IS_LEASE(fl); fl = fl->fl_next) { @@ -1366,7 +1406,7 @@ int fcntl_getlease(struct file *filp) break; } } - unlock_flocks(); + spin_unlock(&inode->i_lock); return type; } @@ -1460,7 +1500,7 @@ static int generic_delete_lease(struct file *filp, struct file_lock **flp) * The (input) flp->fl_lmops->lm_break function is required * by break_lease(). * - * Called with file_lock_lock held. + * Called with inode->i_lock held. */ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) { @@ -1529,11 +1569,12 @@ static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease) int vfs_setlease(struct file *filp, long arg, struct file_lock **lease) { + struct inode *inode = file_inode(filp); int error; - lock_flocks(); + spin_lock(&inode->i_lock); error = __vfs_setlease(filp, arg, lease); - unlock_flocks(); + spin_unlock(&inode->i_lock); return error; } @@ -1551,6 +1592,7 @@ static int do_fcntl_delete_lease(struct file *filp) static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) { struct file_lock *fl, *ret; + struct inode *inode = file_inode(filp); struct fasync_struct *new; int error; @@ -1564,10 +1606,10 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) return -ENOMEM; } ret = fl; - lock_flocks(); + spin_lock(&inode->i_lock); error = __vfs_setlease(filp, arg, &ret); if (error) { - unlock_flocks(); + spin_unlock(&inode->i_lock); locks_free_lock(fl); goto out_free_fasync; } @@ -1584,7 +1626,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) new = NULL; error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); - unlock_flocks(); + spin_unlock(&inode->i_lock); out_free_fasync: if (new) @@ -2108,7 +2150,7 @@ void locks_remove_flock(struct file *filp) fl.fl_ops->fl_release_private(&fl); } - lock_flocks(); + spin_lock(&inode->i_lock); before = &inode->i_flock; while ((fl = *before) != NULL) { @@ -2126,7 +2168,7 @@ void locks_remove_flock(struct file *filp) } before = &fl->fl_next; } - unlock_flocks(); + spin_unlock(&inode->i_lock); } /** @@ -2140,12 +2182,12 @@ posix_unblock_lock(struct file_lock *waiter) { int status = 0; - lock_flocks(); + spin_lock(&file_lock_lock); if (waiter->fl_next) __locks_delete_block(waiter); else status = -ENOENT; - unlock_flocks(); + spin_unlock(&file_lock_lock); return status; } EXPORT_SYMBOL(posix_unblock_lock); @@ -2259,7 +2301,7 @@ static void *locks_start(struct seq_file *f, loff_t *pos) { loff_t *p = f->private; - lock_flocks(); + spin_lock(&file_lock_lock); *p = (*pos + 1); return seq_list_start(&file_lock_list, *pos); } @@ -2273,7 +2315,7 @@ static void *locks_next(struct seq_file *f, void *v, loff_t *pos) static void locks_stop(struct seq_file *f, void *v) { - unlock_flocks(); + spin_unlock(&file_lock_lock); } static const struct seq_operations locks_seq_operations = { @@ -2320,7 +2362,8 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len) { struct file_lock *fl; int result = 1; - lock_flocks(); + + spin_lock(&inode->i_lock); for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { if (IS_POSIX(fl)) { if (fl->fl_type == F_RDLCK) @@ -2337,7 +2380,7 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len) result = 0; break; } - unlock_flocks(); + spin_unlock(&inode->i_lock); return result; } @@ -2360,7 +2403,8 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len) { struct file_lock *fl; int result = 1; - lock_flocks(); + + spin_lock(&inode->i_lock); for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { if (IS_POSIX(fl)) { if ((fl->fl_end < start) || (fl->fl_start > (start + len))) @@ -2375,7 +2419,7 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len) result = 0; break; } - unlock_flocks(); + spin_unlock(&inode->i_lock); return result; } diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 57db3244f4d9..7ec4814e298d 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -73,20 +73,20 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_ if (inode->i_flock == NULL) goto out; - /* Protect inode->i_flock using the file locks lock */ - lock_flocks(); + /* Protect inode->i_flock using the i_lock */ + spin_lock(&inode->i_lock); for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) continue; if (nfs_file_open_context(fl->fl_file) != ctx) continue; - unlock_flocks(); + spin_unlock(&inode->i_lock); status = nfs4_lock_delegation_recall(fl, state, stateid); if (status < 0) goto out; - lock_flocks(); + spin_lock(&inode->i_lock); } - unlock_flocks(); + spin_unlock(&inode->i_lock); out: return status; } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 1fab140764c4..ff10b4aa534c 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1373,13 +1373,13 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ /* Guard against delegation returns and new lock/unlock calls */ down_write(&nfsi->rwsem); /* Protect inode->i_flock using the BKL */ - lock_flocks(); + spin_lock(&inode->i_lock); for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) continue; if (nfs_file_open_context(fl->fl_file)->state != state) continue; - unlock_flocks(); + spin_unlock(&inode->i_lock); status = ops->recover_lock(state, fl); switch (status) { case 0: @@ -1406,9 +1406,9 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ /* kill_proc(fl->fl_pid, SIGLOST, 1); */ status = 0; } - lock_flocks(); + spin_lock(&inode->i_lock); } - unlock_flocks(); + spin_unlock(&inode->i_lock); out: up_write(&nfsi->rwsem); return status; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 316ec843dec2..f17051838b41 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2645,13 +2645,13 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp) list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru); - /* only place dl_time is set. protected by lock_flocks*/ + /* Only place dl_time is set; protected by i_lock: */ dp->dl_time = get_seconds(); nfsd4_cb_recall(dp); } -/* Called from break_lease() with lock_flocks() held. */ +/* Called from break_lease() with i_lock held. */ static void nfsd_break_deleg_cb(struct file_lock *fl) { struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner; @@ -4520,7 +4520,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner) struct inode *inode = filp->fi_inode; int status = 0; - lock_flocks(); + spin_lock(&inode->i_lock); for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) { if ((*flpp)->fl_owner == (fl_owner_t)lowner) { status = 1; @@ -4528,7 +4528,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner) } } out: - unlock_flocks(); + spin_unlock(&inode->i_lock); return status; } diff --git a/include/linux/fs.h b/include/linux/fs.h index ed9fdaaf3223..24fe998795e1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1024,8 +1024,6 @@ extern int vfs_setlease(struct file *, long, struct file_lock **); extern int lease_modify(struct file_lock **, int); extern int lock_may_read(struct inode *, loff_t start, unsigned long count); extern int lock_may_write(struct inode *, loff_t start, unsigned long count); -extern void lock_flocks(void); -extern void unlock_flocks(void); #else /* !CONFIG_FILE_LOCKING */ static inline int fcntl_getlk(struct file *file, struct flock __user *user) { @@ -1166,15 +1164,6 @@ static inline int lock_may_write(struct inode *inode, loff_t start, { return 1; } - -static inline void lock_flocks(void) -{ -} - -static inline void unlock_flocks(void) -{ -} - #endif /* !CONFIG_FILE_LOCKING */ -- cgit v1.2.3 From 3999e49364193f7dbbba66e2be655fe91ba1fced Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 21 Jun 2013 08:58:19 -0400 Subject: locks: add a new "lm_owner_key" lock operation Currently, the hashing that the locking code uses to add these values to the blocked_hash is simply calculated using fl_owner field. That's valid in most cases except for server-side lockd, which validates the owner of a lock based on fl_owner and fl_pid. In the case where you have a small number of NFS clients doing a lot of locking between different processes, you could end up with all the blocked requests sitting in a very small number of hash buckets. Add a new lm_owner_key operation to the lock_manager_operations that will generate an unsigned long to use as the key in the hashtable. That function is only implemented for server-side lockd, and simply XORs the fl_owner and fl_pid. Signed-off-by: Jeff Layton Acked-by: J. Bruce Fields Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 16 +++++++++++----- fs/lockd/svclock.c | 12 ++++++++++++ fs/locks.c | 12 ++++++++++-- include/linux/fs.h | 1 + 4 files changed, 34 insertions(+), 7 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index c2963a74fbc3..2db7c9e492e9 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -349,6 +349,7 @@ fl_release_private: maybe no ----------------------- lock_manager_operations --------------------------- prototypes: int (*lm_compare_owner)(struct file_lock *, struct file_lock *); + unsigned long (*lm_owner_key)(struct file_lock *); void (*lm_notify)(struct file_lock *); /* unblock callback */ int (*lm_grant)(struct file_lock *, struct file_lock *, int); void (*lm_break)(struct file_lock *); /* break_lease callback */ @@ -358,16 +359,21 @@ locking rules: inode->i_lock file_lock_lock may block lm_compare_owner: yes[1] maybe no +lm_owner_key yes[1] yes no lm_notify: yes yes no lm_grant: no no no lm_break: yes no no lm_change yes no no -[1]: ->lm_compare_owner is generally called with *an* inode->i_lock held. It -may not be the i_lock of the inode for either file_lock being compared! This is -the case with deadlock detection, since the code has to chase down the owners -of locks that may be entirely unrelated to the one on which the lock is being -acquired. When doing a search for deadlocks, the file_lock_lock is also held. +[1]: ->lm_compare_owner and ->lm_owner_key are generally called with +*an* inode->i_lock held. It may not be the i_lock of the inode +associated with either file_lock argument! This is the case with deadlock +detection, since the code has to chase down the owners of locks that may +be entirely unrelated to the one on which the lock is being acquired. +For deadlock detection however, the file_lock_lock is also held. The +fact that these locks are held ensures that the file_locks do not +disappear out from under you while doing the comparison or generating an +owner key. --------------------------- buffer_head ----------------------------------- prototypes: diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index a469098682c4..067778b0ccc9 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -744,8 +744,20 @@ static int nlmsvc_same_owner(struct file_lock *fl1, struct file_lock *fl2) return fl1->fl_owner == fl2->fl_owner && fl1->fl_pid == fl2->fl_pid; } +/* + * Since NLM uses two "keys" for tracking locks, we need to hash them down + * to one for the blocked_hash. Here, we're just xor'ing the host address + * with the pid in order to create a key value for picking a hash bucket. + */ +static unsigned long +nlmsvc_owner_key(struct file_lock *fl) +{ + return (unsigned long)fl->fl_owner ^ (unsigned long)fl->fl_pid; +} + const struct lock_manager_operations nlmsvc_lock_operations = { .lm_compare_owner = nlmsvc_same_owner, + .lm_owner_key = nlmsvc_owner_key, .lm_notify = nlmsvc_notify_blocked, .lm_grant = nlmsvc_grant_deferred, }; diff --git a/fs/locks.c b/fs/locks.c index 71d847cbbb63..6242e0b1c69c 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -521,10 +521,18 @@ locks_delete_global_locks(struct file_lock *fl) spin_unlock(&file_lock_lock); } +static unsigned long +posix_owner_key(struct file_lock *fl) +{ + if (fl->fl_lmops && fl->fl_lmops->lm_owner_key) + return fl->fl_lmops->lm_owner_key(fl); + return (unsigned long)fl->fl_owner; +} + static inline void locks_insert_global_blocked(struct file_lock *waiter) { - hash_add(blocked_hash, &waiter->fl_link, (unsigned long)waiter->fl_owner); + hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter)); } static inline void @@ -757,7 +765,7 @@ static struct file_lock *what_owner_is_waiting_for(struct file_lock *block_fl) { struct file_lock *fl; - hash_for_each_possible(blocked_hash, fl, fl_link, (unsigned long)block_fl->fl_owner) { + hash_for_each_possible(blocked_hash, fl, fl_link, posix_owner_key(block_fl)) { if (posix_same_owner(fl, block_fl)) return fl->fl_next; } diff --git a/include/linux/fs.h b/include/linux/fs.h index fab064a3b65f..a137a73fc1fe 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -908,6 +908,7 @@ struct file_lock_operations { struct lock_manager_operations { int (*lm_compare_owner)(struct file_lock *, struct file_lock *); + unsigned long (*lm_owner_key)(struct file_lock *); void (*lm_notify)(struct file_lock *); /* unblock callback */ int (*lm_grant)(struct file_lock *, struct file_lock *, int); void (*lm_break)(struct file_lock *); -- cgit v1.2.3 From 7b2296afb392bc21a50f42e7c7f4b19d3fea8c6d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 21 Jun 2013 08:58:20 -0400 Subject: locks: give the blocked_hash its own spinlock There's no reason we have to protect the blocked_hash and file_lock_list with the same spinlock. With the tests I have, breaking it in two gives a barely measurable performance benefit, but it seems reasonable to make this locking as granular as possible. Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 16 +++++++-------- fs/locks.c | 41 +++++++++++++++++++++------------------ 2 files changed, 30 insertions(+), 27 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 2db7c9e492e9..7d9ca7a83fcc 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -357,20 +357,20 @@ prototypes: locking rules: - inode->i_lock file_lock_lock may block -lm_compare_owner: yes[1] maybe no -lm_owner_key yes[1] yes no -lm_notify: yes yes no -lm_grant: no no no -lm_break: yes no no -lm_change yes no no + inode->i_lock blocked_lock_lock may block +lm_compare_owner: yes[1] maybe no +lm_owner_key yes[1] yes no +lm_notify: yes yes no +lm_grant: no no no +lm_break: yes no no +lm_change yes no no [1]: ->lm_compare_owner and ->lm_owner_key are generally called with *an* inode->i_lock held. It may not be the i_lock of the inode associated with either file_lock argument! This is the case with deadlock detection, since the code has to chase down the owners of locks that may be entirely unrelated to the one on which the lock is being acquired. -For deadlock detection however, the file_lock_lock is also held. The +For deadlock detection however, the blocked_lock_lock is also held. The fact that these locks are held ensures that the file_locks do not disappear out from under you while doing the comparison or generating an owner key. diff --git a/fs/locks.c b/fs/locks.c index 6242e0b1c69c..04e2c1fdb157 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -159,10 +159,11 @@ int lease_break_time = 45; * by the file_lock_lock. */ static HLIST_HEAD(file_lock_list); +static DEFINE_SPINLOCK(file_lock_lock); /* * The blocked_hash is used to find POSIX lock loops for deadlock detection. - * It is protected by file_lock_lock. + * It is protected by blocked_lock_lock. * * We hash locks by lockowner in order to optimize searching for the lock a * particular lockowner is waiting on. @@ -175,8 +176,8 @@ static HLIST_HEAD(file_lock_list); static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS); /* - * This lock protects the blocked_hash and the file_lock_list. Generally, if - * you're accessing one of those lists, you want to be holding this lock. + * This lock protects the blocked_hash. Generally, if you're accessing it, you + * want to be holding this lock. * * In addition, it also protects the fl->fl_block list, and the fl->fl_next * pointer for file_lock structures that are acting as lock requests (in @@ -191,7 +192,7 @@ static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS); * both the i_lock and the blocked_lock_lock (acquired in that order). Deleting * an entry from the list however only requires the file_lock_lock. */ -static DEFINE_SPINLOCK(file_lock_lock); +static DEFINE_SPINLOCK(blocked_lock_lock); static struct kmem_cache *filelock_cache __read_mostly; @@ -544,7 +545,7 @@ locks_delete_global_blocked(struct file_lock *waiter) /* Remove waiter from blocker's block list. * When blocker ends up pointing to itself then the list is empty. * - * Must be called with file_lock_lock held. + * Must be called with blocked_lock_lock held. */ static void __locks_delete_block(struct file_lock *waiter) { @@ -555,9 +556,9 @@ static void __locks_delete_block(struct file_lock *waiter) static void locks_delete_block(struct file_lock *waiter) { - spin_lock(&file_lock_lock); + spin_lock(&blocked_lock_lock); __locks_delete_block(waiter); - spin_unlock(&file_lock_lock); + spin_unlock(&blocked_lock_lock); } /* Insert waiter into blocker's block list. @@ -565,9 +566,9 @@ static void locks_delete_block(struct file_lock *waiter) * the order they blocked. The documentation doesn't require this but * it seems like the reasonable thing to do. * - * Must be called with both the i_lock and file_lock_lock held. The fl_block + * Must be called with both the i_lock and blocked_lock_lock held. The fl_block * list itself is protected by the file_lock_list, but by ensuring that the - * i_lock is also held on insertions we can avoid taking the file_lock_lock + * i_lock is also held on insertions we can avoid taking the blocked_lock_lock * in some cases when we see that the fl_block list is empty. */ static void __locks_insert_block(struct file_lock *blocker, @@ -584,9 +585,9 @@ static void __locks_insert_block(struct file_lock *blocker, static void locks_insert_block(struct file_lock *blocker, struct file_lock *waiter) { - spin_lock(&file_lock_lock); + spin_lock(&blocked_lock_lock); __locks_insert_block(blocker, waiter); - spin_unlock(&file_lock_lock); + spin_unlock(&blocked_lock_lock); } /* @@ -601,12 +602,12 @@ static void locks_wake_up_blocks(struct file_lock *blocker) * blocked requests are only added to the list under the i_lock, and * the i_lock is always held here. Note that removal from the fl_block * list does not require the i_lock, so we must recheck list_empty() - * after acquiring the file_lock_lock. + * after acquiring the blocked_lock_lock. */ if (list_empty(&blocker->fl_block)) return; - spin_lock(&file_lock_lock); + spin_lock(&blocked_lock_lock); while (!list_empty(&blocker->fl_block)) { struct file_lock *waiter; @@ -618,7 +619,7 @@ static void locks_wake_up_blocks(struct file_lock *blocker) else wake_up(&waiter->fl_wait); } - spin_unlock(&file_lock_lock); + spin_unlock(&blocked_lock_lock); } /* Insert file lock fl into an inode's lock list at the position indicated @@ -772,7 +773,7 @@ static struct file_lock *what_owner_is_waiting_for(struct file_lock *block_fl) return NULL; } -/* Must be called with the file_lock_lock held! */ +/* Must be called with the blocked_lock_lock held! */ static int posix_locks_deadlock(struct file_lock *caller_fl, struct file_lock *block_fl) { @@ -920,12 +921,12 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str * locks list must be done while holding the same lock! */ error = -EDEADLK; - spin_lock(&file_lock_lock); + spin_lock(&blocked_lock_lock); if (likely(!posix_locks_deadlock(request, fl))) { error = FILE_LOCK_DEFERRED; __locks_insert_block(fl, request); } - spin_unlock(&file_lock_lock); + spin_unlock(&blocked_lock_lock); goto out; } } @@ -2212,12 +2213,12 @@ posix_unblock_lock(struct file_lock *waiter) { int status = 0; - spin_lock(&file_lock_lock); + spin_lock(&blocked_lock_lock); if (waiter->fl_next) __locks_delete_block(waiter); else status = -ENOENT; - spin_unlock(&file_lock_lock); + spin_unlock(&blocked_lock_lock); return status; } EXPORT_SYMBOL(posix_unblock_lock); @@ -2332,6 +2333,7 @@ static void *locks_start(struct seq_file *f, loff_t *pos) loff_t *p = f->private; spin_lock(&file_lock_lock); + spin_lock(&blocked_lock_lock); *p = (*pos + 1); return seq_hlist_start(&file_lock_list, *pos); } @@ -2345,6 +2347,7 @@ static void *locks_next(struct seq_file *f, void *v, loff_t *pos) static void locks_stop(struct seq_file *f, void *v) { + spin_unlock(&blocked_lock_lock); spin_unlock(&file_lock_lock); } -- cgit v1.2.3 From 48bde8d3620f5f3c6ae9ff599eb404055ae51664 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 3 Jul 2013 16:19:23 +0400 Subject: Document ->tmpfile() Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 2 ++ Documentation/filesystems/vfs.txt | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 7d9ca7a83fcc..e95d3131309e 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -64,6 +64,7 @@ prototypes: int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode, int *opened); + int (*tmpfile) (struct inode *, struct dentry *, umode_t); locking rules: all may block @@ -91,6 +92,7 @@ removexattr: yes fiemap: no update_time: no atomic_open: yes +tmpfile: no Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on victim. diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 51ba44e3fc40..aeff462c7228 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -360,6 +360,8 @@ struct inode_operations { int (*removexattr) (struct dentry *, const char *); void (*update_time)(struct inode *, struct timespec *, int); int (*atomic_open)(struct inode *, struct dentry *, + int (*tmpfile) (struct inode *, struct dentry *, umode_t); +} ____cacheline_aligned; struct file *, unsigned open_flag, umode_t create_mode, int *opened); }; @@ -472,6 +474,9 @@ otherwise noted. component is negative or needs lookup. Cached positive dentries are still handled by f_op->open(). + tmpfile: called in the end of O_TMPFILE open(). Optional, equivalent to + atomically creating, opening and unlinking a file in given directory. + The Address Space Object ======================== -- cgit v1.2.3