linux-brain/fs/xfs/libxfs/xfs_trans_inode.c
Dave Chinner 774cc7c882 xfs: Don't allow logging of XFS_ISTALE inodes
[ Upstream commit 96355d5a1f0ee6dcc182c37db4894ec0c29f1692 ]

In tracking down a problem in this patchset, I discovered we are
reclaiming dirty stale inodes. This wasn't discovered until inodes
were always attached to the cluster buffer and then the rcu callback
that freed inodes was assert failing because the inode still had an
active pointer to the cluster buffer after it had been reclaimed.

Debugging the issue indicated that this was a pre-existing issue
resulting from the way the inodes are handled in xfs_inactive_ifree.
When we free a cluster buffer from xfs_ifree_cluster, all the inodes
in cache are marked XFS_ISTALE. Those that are clean have nothing
else done to them and so eventually get cleaned up by background
reclaim. i.e. it is assumed we'll never dirty/relog an inode marked
XFS_ISTALE.

On journal commit dirty stale inodes as are handled by both
buffer and inode log items to run though xfs_istale_done() and
removed from the AIL (buffer log item commit) or the log item will
simply unpin it because the buffer log item will clean it. What happens
to any specific inode is entirely dependent on which log item wins
the commit race, but the result is the same - stale inodes are
clean, not attached to the cluster buffer, and not in the AIL. Hence
inode reclaim can just free these inodes without further care.

However, if the stale inode is relogged, it gets dirtied again and
relogged into the CIL. Most of the time this isn't an issue, because
relogging simply changes the inode's location in the current
checkpoint. Problems arise, however, when the CIL checkpoints
between two transactions in the xfs_inactive_ifree() deferops
processing. This results in the XFS_ISTALE inode being redirtied
and inserted into the CIL without any of the other stale cluster
buffer infrastructure being in place.

Hence on journal commit, it simply gets unpinned, so it remains
dirty in memory. Everything in inode writeback avoids XFS_ISTALE
inodes so it can't be written back, and it is not tracked in the AIL
so there's not even a trigger to attempt to clean the inode. Hence
the inode just sits dirty in memory until inode reclaim comes along,
sees that it is XFS_ISTALE, and goes to reclaim it. This reclaiming
of a dirty inode caused use after free, list corruptions and other
nasty issues later in this patchset.

Hence this patch addresses a violation of the "never log XFS_ISTALE
inodes" caused by the deferops processing rolling a transaction
and relogging a stale inode in xfs_inactive_free. It also adds a
bunch of asserts to catch this problem in debug kernels so that
we don't reintroduce this problem in future.

Reproducer for this issue was generic/558 on a v4 filesystem.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
2020-09-03 11:26:44 +02:00

159 lines
4.3 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2000,2005 Silicon Graphics, Inc.
* All Rights Reserved.
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_inode_item.h"
#include <linux/iversion.h>
/*
* Add a locked inode to the transaction.
*
* The inode must be locked, and it cannot be associated with any transaction.
* If lock_flags is non-zero the inode will be unlocked on transaction commit.
*/
void
xfs_trans_ijoin(
struct xfs_trans *tp,
struct xfs_inode *ip,
uint lock_flags)
{
xfs_inode_log_item_t *iip;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
if (ip->i_itemp == NULL)
xfs_inode_item_init(ip, ip->i_mount);
iip = ip->i_itemp;
ASSERT(iip->ili_lock_flags == 0);
iip->ili_lock_flags = lock_flags;
ASSERT(!xfs_iflags_test(ip, XFS_ISTALE));
/*
* Get a log_item_desc to point at the new item.
*/
xfs_trans_add_item(tp, &iip->ili_item);
}
/*
* Transactional inode timestamp update. Requires the inode to be locked and
* joined to the transaction supplied. Relies on the transaction subsystem to
* track dirty state and update/writeback the inode accordingly.
*/
void
xfs_trans_ichgtime(
struct xfs_trans *tp,
struct xfs_inode *ip,
int flags)
{
struct inode *inode = VFS_I(ip);
struct timespec64 tv;
ASSERT(tp);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
tv = current_time(inode);
if (flags & XFS_ICHGTIME_MOD)
inode->i_mtime = tv;
if (flags & XFS_ICHGTIME_CHG)
inode->i_ctime = tv;
if (flags & XFS_ICHGTIME_CREATE) {
ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec;
ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec;
}
}
/*
* This is called to mark the fields indicated in fieldmask as needing
* to be logged when the transaction is committed. The inode must
* already be associated with the given transaction.
*
* The values for fieldmask are defined in xfs_inode_item.h. We always
* log all of the core inode if any of it has changed, and we always log
* all of the inline data/extents/b-tree root if any of them has changed.
*/
void
xfs_trans_log_inode(
xfs_trans_t *tp,
xfs_inode_t *ip,
uint flags)
{
struct inode *inode = VFS_I(ip);
ASSERT(ip->i_itemp != NULL);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(!xfs_iflags_test(ip, XFS_ISTALE));
/*
* Don't bother with i_lock for the I_DIRTY_TIME check here, as races
* don't matter - we either will need an extra transaction in 24 hours
* to log the timestamps, or will clear already cleared fields in the
* worst case.
*/
if (inode->i_state & (I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED)) {
spin_lock(&inode->i_lock);
inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED);
spin_unlock(&inode->i_lock);
}
/*
* Record the specific change for fdatasync optimisation. This
* allows fdatasync to skip log forces for inodes that are only
* timestamp dirty. We do this before the change count so that
* the core being logged in this case does not impact on fdatasync
* behaviour.
*/
ip->i_itemp->ili_fsync_fields |= flags;
/*
* First time we log the inode in a transaction, bump the inode change
* counter if it is configured for this to occur. While we have the
* inode locked exclusively for metadata modification, we can usually
* avoid setting XFS_ILOG_CORE if no one has queried the value since
* the last time it was incremented. If we have XFS_ILOG_CORE already
* set however, then go ahead and bump the i_version counter
* unconditionally.
*/
if (!test_and_set_bit(XFS_LI_DIRTY, &ip->i_itemp->ili_item.li_flags) &&
IS_I_VERSION(VFS_I(ip))) {
if (inode_maybe_inc_iversion(VFS_I(ip), flags & XFS_ILOG_CORE))
flags |= XFS_ILOG_CORE;
}
tp->t_flags |= XFS_TRANS_DIRTY;
/*
* Always OR in the bits from the ili_last_fields field.
* This is to coordinate with the xfs_iflush() and xfs_iflush_done()
* routines in the eventual clearing of the ili_fields bits.
* See the big comment in xfs_iflush() for an explanation of
* this coordination mechanism.
*/
flags |= ip->i_itemp->ili_last_fields;
ip->i_itemp->ili_fields |= flags;
}
int
xfs_trans_roll_inode(
struct xfs_trans **tpp,
struct xfs_inode *ip)
{
int error;
xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
error = xfs_trans_roll(tpp);
if (!error)
xfs_trans_ijoin(*tpp, ip, 0);
return error;
}