From dcfcf20512cb517ac18b9433b676183fa1257911 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 23 Dec 2010 11:57:13 +1100 Subject: xfs: provide a inode iolock lockdep class The XFS iolock needs to be re-initialised to a new lock class before it enters reclaim to prevent lockdep false positives. Unfortunately, this is not sufficient protection as inodes in the XFS_IRECLAIMABLE state can be recycled and not re-initialised before being reused. We need to re-initialise the lock state when transfering out of XFS_IRECLAIMABLE state to XFS_INEW, but we need to keep the same class as if the inode was just allocated. Hence we need a specific lockdep class variable for the iolock so that both initialisations use the same class. While there, add a specific class for inodes in the reclaim state so that it is easy to tell from lockdep reports what state the inode was in that generated the report. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_iget.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'fs/xfs/xfs_iget.c') diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 0cdd26932d8e..cdb1c2505fc6 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -42,6 +42,17 @@ #include "xfs_trace.h" +/* + * Define xfs inode iolock lockdep classes. We need to ensure that all active + * inodes are considered the same for lockdep purposes, including inodes that + * are recycled through the XFS_IRECLAIMABLE state. This is the the only way to + * guarantee the locks are considered the same when there are multiple lock + * initialisation siteѕ. Also, define a reclaimable inode class so it is + * obvious in lockdep reports which class the report is against. + */ +static struct lock_class_key xfs_iolock_active; +struct lock_class_key xfs_iolock_reclaimable; + /* * Allocate and initialise an xfs_inode. */ @@ -71,6 +82,8 @@ xfs_inode_alloc( ASSERT(completion_done(&ip->i_flush)); mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + lockdep_set_class_and_name(&ip->i_iolock.mr_lock, + &xfs_iolock_active, "xfs_iolock_active"); /* initialise the xfs inode */ ip->i_ino = ino; @@ -218,6 +231,12 @@ xfs_iget_cache_hit( ip->i_flags |= XFS_INEW; __xfs_inode_clear_reclaim_tag(mp, pag, ip); inode->i_state = I_NEW; + + ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + lockdep_set_class_and_name(&ip->i_iolock.mr_lock, + &xfs_iolock_active, "xfs_iolock_active"); + spin_unlock(&ip->i_flags_lock); write_unlock(&pag->pag_ici_lock); } else { -- cgit v1.2.1 From d95b7aaf9ab6738bef1ebcc52ab66563085e44ac Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 16 Dec 2010 16:41:39 +1100 Subject: xfs: rcu free inodes Introduce RCU freeing of XFS inodes so that we can convert lookup traversals to use rcu_read_lock() protection. This patch only introduces the RCU freeing to minimise the potential conflicts with mainline if this is merged into mainline via a VFS patchset. It abuses the i_dentry list for the RCU callback structure because the VFS patches make this a union so it is safe to use like this and simplifies and merge issues. This patch uses basic RCU freeing rather than SLAB_DESTROY_BY_RCU. The later lookup patches need the same "found free inode" protection regardless of the RCU freeing method used, so once again the RCU freeing method can be dealt with apprpriately at merge time without affecting any other code. Signed-off-by: Dave Chinner Reviewed-by: Paul E. McKenney --- fs/xfs/xfs_iget.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'fs/xfs/xfs_iget.c') diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index cdb1c2505fc6..9fae47556604 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -104,6 +104,18 @@ xfs_inode_alloc( return ip; } +void +__xfs_inode_free( + struct rcu_head *head) +{ + struct inode *inode = container_of((void *)head, + struct inode, i_dentry); + struct xfs_inode *ip = XFS_I(inode); + + INIT_LIST_HEAD(&inode->i_dentry); + kmem_zone_free(xfs_inode_zone, ip); +} + void xfs_inode_free( struct xfs_inode *ip) @@ -147,7 +159,7 @@ xfs_inode_free( ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(completion_done(&ip->i_flush)); - kmem_zone_free(xfs_inode_zone, ip); + call_rcu((struct rcu_head *)&VFS_I(ip)->i_dentry, __xfs_inode_free); } /* -- cgit v1.2.1 From 1a3e8f3da09c7082d25b512a0ffe569391e4c09a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 17 Dec 2010 17:29:43 +1100 Subject: xfs: convert inode cache lookups to use RCU locking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With delayed logging greatly increasing the sustained parallelism of inode operations, the inode cache locking is showing significant read vs write contention when inode reclaim runs at the same time as lookups. There is also a lot more write lock acquistions than there are read locks (4:1 ratio) so the read locking is not really buying us much in the way of parallelism. To avoid the read vs write contention, change the cache to use RCU locking on the read side. To avoid needing to RCU free every single inode, use the built in slab RCU freeing mechanism. This requires us to be able to detect lookups of freed inodes, so enѕure that ever freed inode has an inode number of zero and the XFS_IRECLAIM flag set. We already check the XFS_IRECLAIM flag in cache hit lookup path, but also add a check for a zero inode number as well. We canthen convert all the read locking lockups to use RCU read side locking and hence remove all read side locking. Signed-off-by: Dave Chinner Reviewed-by: Alex Elder --- fs/xfs/xfs_iget.c | 47 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 12 deletions(-) (limited to 'fs/xfs/xfs_iget.c') diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 9fae47556604..04ed09b907b8 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -80,6 +80,7 @@ xfs_inode_alloc( ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(completion_done(&ip->i_flush)); + ASSERT(ip->i_ino == 0); mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); lockdep_set_class_and_name(&ip->i_iolock.mr_lock, @@ -98,9 +99,6 @@ xfs_inode_alloc( ip->i_size = 0; ip->i_new_size = 0; - /* prevent anyone from using this yet */ - VFS_I(ip)->i_state = I_NEW; - return ip; } @@ -159,6 +157,16 @@ xfs_inode_free( ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(completion_done(&ip->i_flush)); + /* + * Because we use RCU freeing we need to ensure the inode always + * appears to be reclaimed with an invalid inode number when in the + * free state. The ip->i_flags_lock provides the barrier against lookup + * races. + */ + spin_lock(&ip->i_flags_lock); + ip->i_flags = XFS_IRECLAIM; + ip->i_ino = 0; + spin_unlock(&ip->i_flags_lock); call_rcu((struct rcu_head *)&VFS_I(ip)->i_dentry, __xfs_inode_free); } @@ -169,14 +177,29 @@ static int xfs_iget_cache_hit( struct xfs_perag *pag, struct xfs_inode *ip, + xfs_ino_t ino, int flags, - int lock_flags) __releases(pag->pag_ici_lock) + int lock_flags) __releases(RCU) { struct inode *inode = VFS_I(ip); struct xfs_mount *mp = ip->i_mount; int error; + /* + * check for re-use of an inode within an RCU grace period due to the + * radix tree nodes not being updated yet. We monitor for this by + * setting the inode number to zero before freeing the inode structure. + * If the inode has been reallocated and set up, then the inode number + * will not match, so check for that, too. + */ spin_lock(&ip->i_flags_lock); + if (ip->i_ino != ino) { + trace_xfs_iget_skip(ip); + XFS_STATS_INC(xs_ig_frecycle); + error = EAGAIN; + goto out_error; + } + /* * If we are racing with another cache hit that is currently @@ -219,7 +242,7 @@ xfs_iget_cache_hit( ip->i_flags |= XFS_IRECLAIM; spin_unlock(&ip->i_flags_lock); - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); error = -inode_init_always(mp->m_super, inode); if (error) { @@ -227,7 +250,7 @@ xfs_iget_cache_hit( * Re-initializing the inode failed, and we are in deep * trouble. Try to re-add it to the reclaim list. */ - read_lock(&pag->pag_ici_lock); + rcu_read_lock(); spin_lock(&ip->i_flags_lock); ip->i_flags &= ~XFS_INEW; @@ -261,7 +284,7 @@ xfs_iget_cache_hit( /* We've got a live one. */ spin_unlock(&ip->i_flags_lock); - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); trace_xfs_iget_hit(ip); } @@ -275,7 +298,7 @@ xfs_iget_cache_hit( out_error: spin_unlock(&ip->i_flags_lock); - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); return error; } @@ -397,7 +420,7 @@ xfs_iget( xfs_agino_t agino; /* reject inode numbers outside existing AGs */ - if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) + if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) return EINVAL; /* get the perag structure and ensure that it's inode capable */ @@ -406,15 +429,15 @@ xfs_iget( again: error = 0; - read_lock(&pag->pag_ici_lock); + rcu_read_lock(); ip = radix_tree_lookup(&pag->pag_ici_root, agino); if (ip) { - error = xfs_iget_cache_hit(pag, ip, flags, lock_flags); + error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); if (error) goto out_error_or_again; } else { - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); XFS_STATS_INC(xs_ig_missed); error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, -- cgit v1.2.1 From 1a427ab0c1b205d1bda8da0b77ea9d295ac23c57 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 16 Dec 2010 17:08:41 +1100 Subject: xfs: convert pag_ici_lock to a spin lock now that we are using RCU protection for the inode cache lookups, the lock is only needed on the modification side. Hence it is not necessary for the lock to be a rwlock as there are no read side holders anymore. Convert it to a spin lock to reflect it's exclusive nature. Signed-off-by: Dave Chinner Reviewed-by: Alex Elder Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_iget.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/xfs/xfs_iget.c') diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 04ed09b907b8..3ecad00e8409 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -260,7 +260,7 @@ xfs_iget_cache_hit( goto out_error; } - write_lock(&pag->pag_ici_lock); + spin_lock(&pag->pag_ici_lock); spin_lock(&ip->i_flags_lock); ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM); ip->i_flags |= XFS_INEW; @@ -273,7 +273,7 @@ xfs_iget_cache_hit( &xfs_iolock_active, "xfs_iolock_active"); spin_unlock(&ip->i_flags_lock); - write_unlock(&pag->pag_ici_lock); + spin_unlock(&pag->pag_ici_lock); } else { /* If the VFS inode is being torn down, pause and try again. */ if (!igrab(inode)) { @@ -351,7 +351,7 @@ xfs_iget_cache_miss( BUG(); } - write_lock(&pag->pag_ici_lock); + spin_lock(&pag->pag_ici_lock); /* insert the new inode */ error = radix_tree_insert(&pag->pag_ici_root, agino, ip); @@ -366,14 +366,14 @@ xfs_iget_cache_miss( ip->i_udquot = ip->i_gdquot = NULL; xfs_iflags_set(ip, XFS_INEW); - write_unlock(&pag->pag_ici_lock); + spin_unlock(&pag->pag_ici_lock); radix_tree_preload_end(); *ipp = ip; return 0; out_preload_end: - write_unlock(&pag->pag_ici_lock); + spin_unlock(&pag->pag_ici_lock); radix_tree_preload_end(); if (lock_flags) xfs_iunlock(ip, lock_flags); -- cgit v1.2.1