From 174c27d0f9ef82baf257e47deb9927aa59cf889b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 4 Aug 2020 19:48:34 -0700 Subject: [PATCH 01/24] btrfs: delete duplicated words + other fixes in comments [ Upstream commit 260db43cd2f556677f6ae818ba09f997eed81004 ] Delete repeated words in fs/btrfs/. {to, the, a, and old} and change "into 2 part" to "into 2 parts". Reviewed-by: Nikolay Borisov Signed-off-by: Randy Dunlap Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/block-group.c | 2 +- fs/btrfs/ctree.c | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/free-space-cache.c | 2 +- fs/btrfs/qgroup.c | 2 +- fs/btrfs/tree-log.c | 4 ++-- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index a352c1704042..e98d6ea35ea8 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2637,7 +2637,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) * finished yet (no block group item in the extent tree * yet, etc). If this is the case, wait for all free * space endio workers to finish and retry. This is a - * a very rare case so no need for a more efficient and + * very rare case so no need for a more efficient and * complex approach. */ if (ret == -ENOENT) { diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index ab69e3563b12..dac30b00d14b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5232,7 +5232,7 @@ again: slot--; /* * check this node pointer against the min_trans parameters. - * If it is too old, old, skip to the next one. + * If it is too old, skip to the next one. */ while (slot < nritems) { u64 gen; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e6aa94a583e9..1d28333bb798 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2816,7 +2816,7 @@ int open_ctree(struct super_block *sb, } /* - * Verify the type first, if that or the the checksum value are + * Verify the type first, if that or the checksum value are * corrupted, we'll find out */ csum_type = btrfs_super_csum_type((struct btrfs_super_block *)bh->b_data); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index eca3abc1a7cd..9108a73423f7 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3152,7 +3152,7 @@ static int __do_readpage(struct extent_io_tree *tree, /* * If we have a file range that points to a compressed extent - * and it's followed by a consecutive file range that points to + * and it's followed by a consecutive file range that points * to the same compressed extent (possibly with a different * offset and/or length, so it either points to the whole extent * or only part of it), we must make sure we do not submit a diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 23f59d463e24..d2d32fed8f2e 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1339,7 +1339,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, /* * at this point the pages are under IO and we're happy, - * The caller is responsible for waiting on them and updating the + * The caller is responsible for waiting on them and updating * the cache and the inode */ io_ctl->entries = entries; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index cd8e81c02f63..837bd5e29c8a 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2262,7 +2262,7 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, * Update qgroup rfer/excl counters. 
* Rfer update is easy, codes can explain themselves. * - * Excl update is tricky, the update is split into 2 part. + * Excl update is tricky, the update is split into 2 parts. * Part 1: Possible exclusive <-> sharing detect: * | A | !A | * ------------------------------------- diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index afc6731bb692..dcbdd0ebea83 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4923,7 +4923,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * Check the inode's logged_trans only instead of * btrfs_inode_in_log(). This is because the last_log_commit of * the inode is not updated when we only log that it exists and - * and it has the full sync bit set (see btrfs_log_inode()). + * it has the full sync bit set (see btrfs_log_inode()). */ if (BTRFS_I(inode)->logged_trans == trans->transid) { spin_unlock(&BTRFS_I(inode)->lock); @@ -6426,7 +6426,7 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT * otherwise. * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to - * to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log, + * sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log, * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be * committed (without attempting to sync the log). */ From cb006da62a9e526fd754dd16564a8e7967b33417 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Aug 2020 12:43:48 +0100 Subject: [PATCH 02/24] btrfs: do not commit logs and transactions during link and rename operations [ Upstream commit 75b463d2b47aef96fe1dc3e0237629963034764b ] Since commit d4682ba03ef618 ("Btrfs: sync log after logging new name") we started to commit logs, and fallback to transaction commits when we failed to log the new names or commit the logs, after link and rename operations when the target inodes (or their parents) were previously logged in the current transaction. This was to avoid losing directories despite an explicit fsync on them when they are ancestors of some inode that got a new named logged, due to a link or rename operation. However that adds the cost of starting IO and waiting for it to complete, which can cause higher latencies for applications. Instead of doing that, just make sure that when we log a new name for an inode we don't mark any of its ancestors as logged, so that if any one does an fsync against any of them, without doing any other change on them, the fsync commits the log. This way we only pay the cost of a log commit (or a transaction commit if something goes wrong or a new block group was created) if the application explicitly asks to fsync any of the parent directories. Using dbench, which mixes several filesystems operations including renames, revealed some significant latency gains. 
The following script that uses dbench was used to test this:

  #!/bin/bash
  DEV=/dev/nvme0n1
  MNT=/mnt/btrfs
  MOUNT_OPTIONS="-o ssd -o space_cache=v2"
  MKFS_OPTIONS="-m single -d single"
  THREADS=16

  echo "performance" | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
  mkfs.btrfs -f $MKFS_OPTIONS $DEV
  mount $MOUNT_OPTIONS $DEV $MNT

  dbench -t 300 -D $MNT $THREADS

  umount $MNT

The test was run on bare metal, no virtualization, on a box with 12 cores
(Intel i7-8700), 64Gb of RAM and using a NVMe device, with a kernel
configuration that is the default of typical distributions (debian in this
case), without debug options enabled (kasan, kmemleak, slub debug, debug of
page allocations, lock debugging, etc).

Results before this patch:

 Operation      Count    AvgLat    MaxLat
 ----------------------------------------
 NTCreateX   10750455     0.011   155.088
 Close        7896674     0.001     0.243
 Rename        455222     2.158  1101.947
 Unlink       2171189     0.067   121.638
 Deltree          256     2.425     7.816
 Mkdir            128     0.002     0.003
 Qpathinfo    9744323     0.006    21.370
 Qfileinfo    1707092     0.001     0.146
 Qfsinfo      1786756     0.001    11.228
 Sfileinfo     875612     0.003    21.263
 Find         3767281     0.025     9.617
 WriteX       5356924     0.011   211.390
 ReadX       16852694     0.003     9.442
 LockX          35008     0.002     0.119
 UnlockX        35008     0.001     0.138
 Flush         753458     4.252  1102.249

Throughput 1128.35 MB/sec  16 clients  16 procs  max_latency=1102.255 ms

Results after this patch:

16 clients, after

 Operation      Count    AvgLat    MaxLat
 ----------------------------------------
 NTCreateX   11471098     0.012   448.281
 Close        8426396     0.001     0.925
 Rename        485746     0.123   267.183
 Unlink       2316477     0.080    63.433
 Deltree          288     2.830    11.144
 Mkdir            144     0.003     0.010
 Qpathinfo   10397420     0.006    10.288
 Qfileinfo    1822039     0.001     0.169
 Qfsinfo      1906497     0.002    14.039
 Sfileinfo     934433     0.004     2.438
 Find         4019879     0.026    10.200
 WriteX       5718932     0.011   200.985
 ReadX       17981671     0.003    10.036
 LockX          37352     0.002     0.076
 UnlockX        37352     0.001     0.109
 Flush         804018     5.015   778.033

Throughput 1201.98 MB/sec  16 clients  16 procs  max_latency=778.036 ms

(+6.5% throughput, -29.4% max latency, -75.8% rename latency)

Test case generic/498 from fstests tests the scenario that the previously
mentioned commit fixed.
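As a quick cross-check, the summary deltas quoted above can be recomputed from
the two dbench tables. A throwaway snippet (illustration only, not part of the
patch):

  #include <stdio.h>

  /* Recompute the quoted deltas from the dbench result tables above. */
  int main(void)
  {
          double tput_before = 1128.35, tput_after = 1201.98;       /* MB/sec */
          double maxlat_before = 1102.255, maxlat_after = 778.036;  /* ms */
          double ren_before = 1101.947, ren_after = 267.183;        /* Rename MaxLat, ms */

          printf("throughput:    %+.1f%%\n", (tput_after / tput_before - 1) * 100);     /* +6.5%  */
          printf("max latency:   %+.1f%%\n", (maxlat_after / maxlat_before - 1) * 100); /* -29.4% */
          printf("rename maxlat: %+.1f%%\n", (ren_after / ren_before - 1) * 100);       /* -75.8% */
          return 0;
  }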
Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/inode.c | 115 +++++--------------------------------------- fs/btrfs/tree-log.c | 100 +++++++++++++++++--------------------- fs/btrfs/tree-log.h | 14 ++---- 3 files changed, 60 insertions(+), 169 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 025b02e9799f..8959d011aafa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6992,7 +6992,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, drop_inode = 1; } else { struct dentry *parent = dentry->d_parent; - int ret; err = btrfs_update_inode(trans, root, inode); if (err) @@ -7007,12 +7006,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, goto fail; } d_instantiate(dentry, inode); - ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent, - true, NULL); - if (ret == BTRFS_NEED_TRANS_COMMIT) { - err = btrfs_commit_transaction(trans); - trans = NULL; - } + btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent); } fail: @@ -9699,27 +9693,19 @@ static int btrfs_rename_exchange(struct inode *old_dir, struct inode *new_inode = new_dentry->d_inode; struct inode *old_inode = old_dentry->d_inode; struct timespec64 ctime = current_time(old_inode); - struct dentry *parent; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); u64 new_ino = btrfs_ino(BTRFS_I(new_inode)); u64 old_idx = 0; u64 new_idx = 0; int ret; + int ret2; bool root_log_pinned = false; bool dest_log_pinned = false; - struct btrfs_log_ctx ctx_root; - struct btrfs_log_ctx ctx_dest; - bool sync_log_root = false; - bool sync_log_dest = false; - bool commit_transaction = false; /* we only allow rename subvolume link between subvolumes */ if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) return -EXDEV; - btrfs_init_log_ctx(&ctx_root, old_inode); - btrfs_init_log_ctx(&ctx_dest, new_inode); - /* close the race window with snapshot create/destroy ioctl */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID || new_ino == BTRFS_FIRST_FREE_OBJECTID) @@ -9861,30 +9847,14 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(new_inode)->dir_index = new_idx; if (root_log_pinned) { - parent = new_dentry->d_parent; - ret = btrfs_log_new_name(trans, BTRFS_I(old_inode), - BTRFS_I(old_dir), parent, - false, &ctx_root); - if (ret == BTRFS_NEED_LOG_SYNC) - sync_log_root = true; - else if (ret == BTRFS_NEED_TRANS_COMMIT) - commit_transaction = true; - ret = 0; + btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), + new_dentry->d_parent); btrfs_end_log_trans(root); root_log_pinned = false; } if (dest_log_pinned) { - if (!commit_transaction) { - parent = old_dentry->d_parent; - ret = btrfs_log_new_name(trans, BTRFS_I(new_inode), - BTRFS_I(new_dir), parent, - false, &ctx_dest); - if (ret == BTRFS_NEED_LOG_SYNC) - sync_log_dest = true; - else if (ret == BTRFS_NEED_TRANS_COMMIT) - commit_transaction = true; - ret = 0; - } + btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir), + old_dentry->d_parent); btrfs_end_log_trans(dest); dest_log_pinned = false; } @@ -9917,46 +9887,13 @@ out_fail: dest_log_pinned = false; } } - if (!ret && sync_log_root && !commit_transaction) { - ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, - &ctx_root); - if (ret) - commit_transaction = true; - } - if (!ret && sync_log_dest && !commit_transaction) { - ret = btrfs_sync_log(trans, BTRFS_I(new_inode)->root, - &ctx_dest); - if (ret) - commit_transaction = true; - } - if (commit_transaction) { - /* - * We may have set commit_transaction when 
logging the new name - * in the destination root, in which case we left the source - * root context in the list of log contextes. So make sure we - * remove it to avoid invalid memory accesses, since the context - * was allocated in our stack frame. - */ - if (sync_log_root) { - mutex_lock(&root->log_mutex); - list_del_init(&ctx_root.list); - mutex_unlock(&root->log_mutex); - } - ret = btrfs_commit_transaction(trans); - } else { - int ret2; - - ret2 = btrfs_end_transaction(trans); - ret = ret ? ret : ret2; - } + ret2 = btrfs_end_transaction(trans); + ret = ret ? ret : ret2; out_notrans: if (new_ino == BTRFS_FIRST_FREE_OBJECTID || old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); - ASSERT(list_empty(&ctx_root.list)); - ASSERT(list_empty(&ctx_dest.list)); - return ret; } @@ -10024,11 +9961,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *old_inode = d_inode(old_dentry); u64 index = 0; int ret; + int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); bool log_pinned = false; - struct btrfs_log_ctx ctx; - bool sync_log = false; - bool commit_transaction = false; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -10178,17 +10113,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, BTRFS_I(old_inode)->dir_index = index; if (log_pinned) { - struct dentry *parent = new_dentry->d_parent; - - btrfs_init_log_ctx(&ctx, old_inode); - ret = btrfs_log_new_name(trans, BTRFS_I(old_inode), - BTRFS_I(old_dir), parent, - false, &ctx); - if (ret == BTRFS_NEED_LOG_SYNC) - sync_log = true; - else if (ret == BTRFS_NEED_TRANS_COMMIT) - commit_transaction = true; - ret = 0; + btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), + new_dentry->d_parent); btrfs_end_log_trans(root); log_pinned = false; } @@ -10225,23 +10151,8 @@ out_fail: btrfs_end_log_trans(root); log_pinned = false; } - if (!ret && sync_log) { - ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx); - if (ret) - commit_transaction = true; - } else if (sync_log) { - mutex_lock(&root->log_mutex); - list_del(&ctx.list); - mutex_unlock(&root->log_mutex); - } - if (commit_transaction) { - ret = btrfs_commit_transaction(trans); - } else { - int ret2; - - ret2 = btrfs_end_transaction(trans); - ret = ret ? ret : ret2; - } + ret2 = btrfs_end_transaction(trans); + ret = ret ? ret : ret2; out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index dcbdd0ebea83..53607156b008 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -174,7 +174,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); - if (ctx) { + if (ctx && !ctx->logging_new_name) { int index = root->log_transid % 2; list_add_tail(&ctx->list, &root->log_ctxs[index]); ctx->log_transid = root->log_transid; @@ -5379,19 +5379,34 @@ log_extents: } /* - * Don't update last_log_commit if we logged that an inode exists after - * it was loaded to memory (full_sync bit set). - * This is to prevent data loss when we do a write to the inode, then - * the inode gets evicted after all delalloc was flushed, then we log - * it exists (due to a rename for example) and then fsync it. This last - * fsync would do nothing (not logging the extents previously written). 
+ * If we are logging that an ancestor inode exists as part of logging a + * new name from a link or rename operation, don't mark the inode as + * logged - otherwise if an explicit fsync is made against an ancestor, + * the fsync considers the inode in the log and doesn't sync the log, + * resulting in the ancestor missing after a power failure unless the + * log was synced as part of an fsync against any other unrelated inode. + * So keep it simple for this case and just don't flag the ancestors as + * logged. */ - spin_lock(&inode->lock); - inode->logged_trans = trans->transid; - if (inode_only != LOG_INODE_EXISTS || - !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) - inode->last_log_commit = inode->last_sub_trans; - spin_unlock(&inode->lock); + if (!ctx || + !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name && + &inode->vfs_inode != ctx->inode)) { + spin_lock(&inode->lock); + inode->logged_trans = trans->transid; + /* + * Don't update last_log_commit if we logged that an inode exists + * after it was loaded to memory (full_sync bit set). + * This is to prevent data loss when we do a write to the inode, + * then the inode gets evicted after all delalloc was flushed, + * then we log it exists (due to a rename for example) and then + * fsync it. This last fsync would do nothing (not logging the + * extents previously written). + */ + if (inode_only != LOG_INODE_EXISTS || + !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) + inode->last_log_commit = inode->last_sub_trans; + spin_unlock(&inode->lock); + } out_unlock: mutex_unlock(&inode->log_mutex); @@ -6417,26 +6432,13 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, /* * Call this after adding a new name for a file and it will properly * update the log to reflect the new name. - * - * @ctx can not be NULL when @sync_log is false, and should be NULL when it's - * true (because it's not used). - * - * Return value depends on whether @sync_log is true or false. - * When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be - * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT - * otherwise. - * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to - * sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log, - * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be - * committed (without attempting to sync the log). */ -int btrfs_log_new_name(struct btrfs_trans_handle *trans, +void btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_inode *old_dir, - struct dentry *parent, - bool sync_log, struct btrfs_log_ctx *ctx) + struct dentry *parent) { struct btrfs_fs_info *fs_info = trans->fs_info; - int ret; + struct btrfs_log_ctx ctx; /* * this will force the logging code to walk the dentry chain @@ -6451,34 +6453,18 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, */ if (inode->logged_trans <= fs_info->last_trans_committed && (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) - return sync_log ? 
BTRFS_DONT_NEED_TRANS_COMMIT : - BTRFS_DONT_NEED_LOG_SYNC; + return; - if (sync_log) { - struct btrfs_log_ctx ctx2; - - btrfs_init_log_ctx(&ctx2, &inode->vfs_inode); - ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, - LOG_INODE_EXISTS, &ctx2); - if (ret == BTRFS_NO_LOG_SYNC) - return BTRFS_DONT_NEED_TRANS_COMMIT; - else if (ret) - return BTRFS_NEED_TRANS_COMMIT; - - ret = btrfs_sync_log(trans, inode->root, &ctx2); - if (ret) - return BTRFS_NEED_TRANS_COMMIT; - return BTRFS_DONT_NEED_TRANS_COMMIT; - } - - ASSERT(ctx); - ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, - LOG_INODE_EXISTS, ctx); - if (ret == BTRFS_NO_LOG_SYNC) - return BTRFS_DONT_NEED_LOG_SYNC; - else if (ret) - return BTRFS_NEED_TRANS_COMMIT; - - return BTRFS_NEED_LOG_SYNC; + btrfs_init_log_ctx(&ctx, &inode->vfs_inode); + ctx.logging_new_name = true; + /* + * We don't care about the return value. If we fail to log the new name + * then we know the next attempt to sync the log will fallback to a full + * transaction commit (due to a call to btrfs_set_log_full_commit()), so + * we don't need to worry about getting a log committed that has an + * inconsistent state after a rename operation. + */ + btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, + LOG_INODE_EXISTS, &ctx); } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 132e43d29034..ddfc6789d9bf 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -16,6 +16,7 @@ struct btrfs_log_ctx { int log_ret; int log_transid; bool log_new_dentries; + bool logging_new_name; struct inode *inode; struct list_head list; }; @@ -26,6 +27,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, ctx->log_ret = 0; ctx->log_transid = 0; ctx->log_new_dentries = false; + ctx->logging_new_name = false; ctx->inode = inode; INIT_LIST_HEAD(&ctx->list); } @@ -67,16 +69,8 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, int for_rename); void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, struct btrfs_inode *dir); -/* Return values for btrfs_log_new_name() */ -enum { - BTRFS_DONT_NEED_TRANS_COMMIT, - BTRFS_NEED_TRANS_COMMIT, - BTRFS_DONT_NEED_LOG_SYNC, - BTRFS_NEED_LOG_SYNC, -}; -int btrfs_log_new_name(struct btrfs_trans_handle *trans, +void btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_inode *old_dir, - struct dentry *parent, - bool sync_log, struct btrfs_log_ctx *ctx); + struct dentry *parent); #endif From b7f0fa2192c5263a57033de763d2d00fe7c05116 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 25 Nov 2020 12:19:23 +0000 Subject: [PATCH 03/24] btrfs: fix race causing unnecessary inode logging during link and rename [ Upstream commit de53d892e5c51dfa0a158e812575a75a6c991f39 ] When we are doing a rename or a link operation for an inode that was logged in the previous transaction and that transaction is still committing, we have a time window where we incorrectly consider that the inode was logged previously in the current transaction and therefore decide to log it to update it in the log. 
The following steps give an example on how this happens during a link operation: 1) Inode X is logged in transaction 1000, so its logged_trans field is set to 1000; 2) Task A starts to commit transaction 1000; 3) The state of transaction 1000 is changed to TRANS_STATE_UNBLOCKED; 4) Task B starts a link operation for inode X, and as a consequence it starts transaction 1001; 5) Task A is still committing transaction 1000, therefore the value stored at fs_info->last_trans_committed is still 999; 6) Task B calls btrfs_log_new_name(), it reads a value of 999 from fs_info->last_trans_committed and because the logged_trans field of inode X has a value of 1000, the function does not return immediately, instead it proceeds to logging the inode, which should not happen because the inode was logged in the previous transaction (1000) and not in the current one (1001). This is not a functional problem, just wasted time and space logging an inode that does not need to be logged, contributing to higher latency for link and rename operations. So fix this by comparing the inodes' logged_trans field with the generation of the current transaction instead of comparing with the value stored in fs_info->last_trans_committed. This case is often hit when running dbench for a long enough duration, as it does lots of rename operations. This patch belongs to a patch set that is comprised of the following patches: btrfs: fix race causing unnecessary inode logging during link and rename btrfs: fix race that results in logging old extents during a fast fsync btrfs: fix race that causes unnecessary logging of ancestor inodes btrfs: fix race that makes inode logging fallback to transaction commit btrfs: fix race leading to unnecessary transaction commit when logging inode btrfs: do not block inode logging for so long during transaction commit Performance results are mentioned in the change log of the last patch. Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/tree-log.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 53607156b008..9912b5231bcf 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -6437,7 +6437,6 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_inode *old_dir, struct dentry *parent) { - struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_log_ctx ctx; /* @@ -6451,8 +6450,8 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * if this inode hasn't been logged and directory we're renaming it * from hasn't been logged, we don't need to log it */ - if (inode->logged_trans <= fs_info->last_trans_committed && - (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) + if (inode->logged_trans < trans->transid && + (!old_dir || old_dir->logged_trans < trans->transid)) return; btrfs_init_log_ctx(&ctx, &inode->vfs_inode); From 86f2a3e9aae9f88ef2189fdaa2ccf9e7321dcc8f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 27 Jul 2021 11:24:43 +0100 Subject: [PATCH 04/24] btrfs: fix lost inode on log replay after mix of fsync, rename and inode eviction [ Upstream commit ecc64fab7d49c678e70bd4c35fe64d2ab3e3d212 ] When checking if we need to log the new name of a renamed inode, we are checking if the inode and its parent inode have been logged before, and if not we don't log the new name. 
The check however is buggy, as it directly compares the logged_trans field of the inodes versus the ID of the current transaction. The problem is that logged_trans is a transient field, only stored in memory and never persisted in the inode item, so if an inode was logged before, evicted and reloaded, its logged_trans field is set to a value of 0, meaning the check will return false and the new name of the renamed inode is not logged. If the old parent directory was previously fsynced and we deleted the logged directory entries corresponding to the old name, we end up with a log that when replayed will delete the renamed inode. The following example triggers the problem: $ mkfs.btrfs -f /dev/sdc $ mount /dev/sdc /mnt $ mkdir /mnt/A $ mkdir /mnt/B $ echo -n "hello world" > /mnt/A/foo $ sync # Add some new file to A and fsync directory A. $ touch /mnt/A/bar $ xfs_io -c "fsync" /mnt/A # Now trigger inode eviction. We are only interested in triggering # eviction for the inode of directory A. $ echo 2 > /proc/sys/vm/drop_caches # Move foo from directory A to directory B. # This deletes the directory entries for foo in A from the log, and # does not add the new name for foo in directory B to the log, because # logged_trans of A is 0, which is less than the current transaction ID. $ mv /mnt/A/foo /mnt/B/foo # Now make an fsync to anything except A, B or any file inside them, # like for example create a file at the root directory and fsync this # new file. This syncs the log that contains all the changes done by # previous rename operation. $ touch /mnt/baz $ xfs_io -c "fsync" /mnt/baz # Mount the filesystem and replay the log. $ mount /dev/sdc /mnt # Check the filesystem content. $ ls -1R /mnt /mnt/: A B baz /mnt/A: bar /mnt/B: $ # File foo is gone, it's neither in A/ nor in B/. Fix this by using the inode_logged() helper at btrfs_log_new_name(), which safely checks if an inode was logged before in the current transaction. A test case for fstests will follow soon. CC: stable@vger.kernel.org # 4.14+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/tree-log.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9912b5231bcf..5412361d0c27 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -6450,8 +6450,8 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * if this inode hasn't been logged and directory we're renaming it * from hasn't been logged, we don't need to log it */ - if (inode->logged_trans < trans->transid && - (!old_dir || old_dir->logged_trans < trans->transid)) + if (!inode_logged(trans, inode) && + (!old_dir || !inode_logged(trans, old_dir))) return; btrfs_init_log_ctx(&ctx, &inode->vfs_inode); From b72f2d9e91e1fa10f6b537b40836bbff61d5d819 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Sun, 27 Jun 2021 16:04:18 +0800 Subject: [PATCH 05/24] regulator: rt5033: Fix n_voltages settings for BUCK and LDO [ Upstream commit 6549c46af8551b346bcc0b9043f93848319acd5c ] For linear regulators, the n_voltages should be (max - min) / step + 1. Buck voltage from 1v to 3V, per step 100mV, and vout mask is 0x1f. If value is from 20 to 31, the voltage will all be fixed to 3V. And LDO also, just vout range is different from 1.2v to 3v, step is the same. If value is from 18 to 31, the voltage will also be fixed to 3v. 
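For reference, a minimal sketch of the linear-range arithmetic behind the
corrected values (the constants are the RT5033 ones touched by this patch; the
snippet itself is only an illustration, not driver code):

  #include <stdio.h>

  /* For a linear regulator, n_voltages = (max - min) / step + 1. */
  static unsigned int n_voltages(unsigned int min_uV, unsigned int max_uV,
                                 unsigned int step_uV)
  {
          return (max_uV - min_uV) / step_uV + 1;
  }

  int main(void)
  {
          /* BUCK: 1.0 V .. 3.0 V in 100 mV steps -> 21 */
          printf("buck: %u\n", n_voltages(1000000U, 3000000U, 100000U));
          /* LDO:  1.2 V .. 3.0 V in 100 mV steps -> 19 */
          printf("ldo:  %u\n", n_voltages(1200000U, 3000000U, 100000U));
          return 0;
  }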
Signed-off-by: Axel Lin Reviewed-by: ChiYuan Huang Link: https://lore.kernel.org/r/20210627080418.1718127-1-axel.lin@ingics.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- include/linux/mfd/rt5033-private.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/mfd/rt5033-private.h b/include/linux/mfd/rt5033-private.h index f812105c538c..f2271bfb3273 100644 --- a/include/linux/mfd/rt5033-private.h +++ b/include/linux/mfd/rt5033-private.h @@ -200,13 +200,13 @@ enum rt5033_reg { #define RT5033_REGULATOR_BUCK_VOLTAGE_MIN 1000000U #define RT5033_REGULATOR_BUCK_VOLTAGE_MAX 3000000U #define RT5033_REGULATOR_BUCK_VOLTAGE_STEP 100000U -#define RT5033_REGULATOR_BUCK_VOLTAGE_STEP_NUM 32 +#define RT5033_REGULATOR_BUCK_VOLTAGE_STEP_NUM 21 /* RT5033 regulator LDO output voltage uV */ #define RT5033_REGULATOR_LDO_VOLTAGE_MIN 1200000U #define RT5033_REGULATOR_LDO_VOLTAGE_MAX 3000000U #define RT5033_REGULATOR_LDO_VOLTAGE_STEP 100000U -#define RT5033_REGULATOR_LDO_VOLTAGE_STEP_NUM 32 +#define RT5033_REGULATOR_LDO_VOLTAGE_STEP_NUM 19 /* RT5033 regulator SAFE LDO output voltage uV */ #define RT5033_REGULATOR_SAFE_LDO_VOLTAGE 4900000U From e2cccb839a184d66087fae745c5d9d578088f0ec Mon Sep 17 00:00:00 2001 From: Alain Volmat Date: Wed, 30 Jun 2021 10:45:19 +0200 Subject: [PATCH 06/24] spi: stm32h7: fix full duplex irq handler handling [ Upstream commit e4a5c19888a5f8a9390860ca493e643be58c8791 ] In case of Full-Duplex mode, DXP flag is set when RXP and TXP flags are set. But to avoid 2 different handlings, just add TXP and RXP flag in the mask instead of DXP, and then keep the initial handling of TXP and RXP events. Also rephrase comment about EOTIE which is one of the interrupt enable bits. It is not triggered by any event. Signed-off-by: Amelie Delaunay Signed-off-by: Alain Volmat Reviewed-by: Amelie Delaunay Link: https://lore.kernel.org/r/1625042723-661-3-git-send-email-alain.volmat@foss.st.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- drivers/spi/spi-stm32.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/spi/spi-stm32.c b/drivers/spi/spi-stm32.c index e9d48e94f5ed..9ae16092206d 100644 --- a/drivers/spi/spi-stm32.c +++ b/drivers/spi/spi-stm32.c @@ -913,15 +913,18 @@ static irqreturn_t stm32h7_spi_irq_thread(int irq, void *dev_id) ier = readl_relaxed(spi->base + STM32H7_SPI_IER); mask = ier; - /* EOTIE is triggered on EOT, SUSP and TXC events. */ + /* + * EOTIE enables irq from EOT, SUSP and TXC events. We need to set + * SUSP to acknowledge it later. TXC is automatically cleared + */ + mask |= STM32H7_SPI_SR_SUSP; /* - * When TXTF is set, DXPIE and TXPIE are cleared. So in case of - * Full-Duplex, need to poll RXP event to know if there are remaining - * data, before disabling SPI. + * DXPIE is set in Full-Duplex, one IT will be raised if TXP and RXP + * are set. So in case of Full-Duplex, need to poll TXP and RXP event. */ - if (spi->rx_buf && !spi->cur_usedma) - mask |= STM32H7_SPI_SR_RXP; + if ((spi->cur_comm == SPI_FULL_DUPLEX) && !spi->cur_usedma) + mask |= STM32H7_SPI_SR_TXP | STM32H7_SPI_SR_RXP; if (!(sr & mask)) { dev_warn(spi->dev, "spurious IT (sr=0x%08x, ier=0x%08x)\n", From a57c75ff070044b9da238143adafd684f0d5d7cc Mon Sep 17 00:00:00 2001 From: Kyle Russell Date: Mon, 21 Jun 2021 21:09:41 -0400 Subject: [PATCH 07/24] ASoC: tlv320aic31xx: fix reversed bclk/wclk master bits [ Upstream commit 9cf76a72af6ab81030dea6481b1d7bdd814fbdaf ] These are backwards from Table 7-71 of the TLV320AIC3100 spec [1]. 
This was broken in 12eb4d66ba2e when BCLK_MASTER and WCLK_MASTER were converted from 0x08 and 0x04 to BIT(2) and BIT(3), respectively. -#define AIC31XX_BCLK_MASTER 0x08 -#define AIC31XX_WCLK_MASTER 0x04 +#define AIC31XX_BCLK_MASTER BIT(2) +#define AIC31XX_WCLK_MASTER BIT(3) Probably just a typo since the defines were not listed in bit order. [1] https://www.ti.com/lit/gpn/tlv320aic3100 Signed-off-by: Kyle Russell Link: https://lore.kernel.org/r/20210622010941.241386-1-bkylerussell@gmail.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/codecs/tlv320aic31xx.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/tlv320aic31xx.h b/sound/soc/codecs/tlv320aic31xx.h index cb024955c978..73c5f6c8ed69 100644 --- a/sound/soc/codecs/tlv320aic31xx.h +++ b/sound/soc/codecs/tlv320aic31xx.h @@ -151,8 +151,8 @@ struct aic31xx_pdata { #define AIC31XX_WORD_LEN_24BITS 0x02 #define AIC31XX_WORD_LEN_32BITS 0x03 #define AIC31XX_IFACE1_MASTER_MASK GENMASK(3, 2) -#define AIC31XX_BCLK_MASTER BIT(2) -#define AIC31XX_WCLK_MASTER BIT(3) +#define AIC31XX_BCLK_MASTER BIT(3) +#define AIC31XX_WCLK_MASTER BIT(2) /* AIC31XX_DATA_OFFSET */ #define AIC31XX_DATA_OFFSET_MASK GENMASK(7, 0) From 6bc48348eca77837cb5307b83d73fcbbfb6efe85 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 14 Jul 2021 19:00:21 +0200 Subject: [PATCH 08/24] r8152: Fix potential PM refcount imbalance [ Upstream commit 9c23aa51477a37f8b56c3c40192248db0663c196 ] rtl8152_close() takes the refcount via usb_autopm_get_interface() but it doesn't release when RTL8152_UNPLUG test hits. This may lead to the imbalance of PM refcount. This patch addresses it. Link: https://bugzilla.suse.com/show_bug.cgi?id=1186194 Signed-off-by: Takashi Iwai Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/usb/r8152.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 24d124633037..873f288e7cec 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -4317,10 +4317,11 @@ static int rtl8152_close(struct net_device *netdev) tp->rtl_ops.down(tp); mutex_unlock(&tp->control); - - usb_autopm_put_interface(tp->intf); } + if (!res) + usb_autopm_put_interface(tp->intf); + free_all_mem(tp); return res; From bf692e7ef657bae632b1b61482116d571b56d7b8 Mon Sep 17 00:00:00 2001 From: Jia He Date: Thu, 15 Jul 2021 16:08:21 +0800 Subject: [PATCH 09/24] qed: fix possible unpaired spin_{un}lock_bh in _qed_mcp_cmd_and_union() [ Upstream commit 6206b7981a36476f4695d661ae139f7db36a802d ] Liajian reported a bug_on hit on a ThunderX2 arm64 server with FastLinQ QL41000 ethernet controller: BUG: scheduling while atomic: kworker/0:4/531/0x00000200 [qed_probe:488()]hw prepare failed kernel BUG at mm/vmalloc.c:2355! Internal error: Oops - BUG: 0 [#1] SMP CPU: 0 PID: 531 Comm: kworker/0:4 Tainted: G W 5.4.0-77-generic #86-Ubuntu pstate: 00400009 (nzcv daif +PAN -UAO) Call trace: vunmap+0x4c/0x50 iounmap+0x48/0x58 qed_free_pci+0x60/0x80 [qed] qed_probe+0x35c/0x688 [qed] __qede_probe+0x88/0x5c8 [qede] qede_probe+0x60/0xe0 [qede] local_pci_probe+0x48/0xa0 work_for_cpu_fn+0x24/0x38 process_one_work+0x1d0/0x468 worker_thread+0x238/0x4e0 kthread+0xf0/0x118 ret_from_fork+0x10/0x18 In this case, qed_hw_prepare() returns error due to hw/fw error, but in theory work queue should be in process context instead of interrupt. 
The root cause might be the unpaired spin_{un}lock_bh() in _qed_mcp_cmd_and_union(), which causes botton half is disabled incorrectly. Reported-by: Lijian Zhang Signed-off-by: Jia He Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/qlogic/qed/qed_mcp.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c index 9401b49275f0..5d85ae59bc51 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c @@ -498,14 +498,18 @@ _qed_mcp_cmd_and_union(struct qed_hwfn *p_hwfn, spin_lock_bh(&p_hwfn->mcp_info->cmd_lock); - if (!qed_mcp_has_pending_cmd(p_hwfn)) + if (!qed_mcp_has_pending_cmd(p_hwfn)) { + spin_unlock_bh(&p_hwfn->mcp_info->cmd_lock); break; + } rc = qed_mcp_update_pending_cmd(p_hwfn, p_ptt); - if (!rc) + if (!rc) { + spin_unlock_bh(&p_hwfn->mcp_info->cmd_lock); break; - else if (rc != -EAGAIN) + } else if (rc != -EAGAIN) { goto err; + } spin_unlock_bh(&p_hwfn->mcp_info->cmd_lock); @@ -522,6 +526,8 @@ _qed_mcp_cmd_and_union(struct qed_hwfn *p_hwfn, return -EAGAIN; } + spin_lock_bh(&p_hwfn->mcp_info->cmd_lock); + /* Send the mailbox command */ qed_mcp_reread_offsets(p_hwfn, p_ptt); seq_num = ++p_hwfn->mcp_info->drv_mb_seq; @@ -548,14 +554,18 @@ _qed_mcp_cmd_and_union(struct qed_hwfn *p_hwfn, spin_lock_bh(&p_hwfn->mcp_info->cmd_lock); - if (p_cmd_elem->b_is_completed) + if (p_cmd_elem->b_is_completed) { + spin_unlock_bh(&p_hwfn->mcp_info->cmd_lock); break; + } rc = qed_mcp_update_pending_cmd(p_hwfn, p_ptt); - if (!rc) + if (!rc) { + spin_unlock_bh(&p_hwfn->mcp_info->cmd_lock); break; - else if (rc != -EAGAIN) + } else if (rc != -EAGAIN) { goto err; + } spin_unlock_bh(&p_hwfn->mcp_info->cmd_lock); } while (++cnt < max_retries); @@ -576,6 +586,7 @@ _qed_mcp_cmd_and_union(struct qed_hwfn *p_hwfn, return -EAGAIN; } + spin_lock_bh(&p_hwfn->mcp_info->cmd_lock); qed_mcp_cmd_del_elem(p_hwfn, p_cmd_elem); spin_unlock_bh(&p_hwfn->mcp_info->cmd_lock); From b508b652d4f3d03929e6f0533a44b3dea7493ece Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Thu, 15 Jul 2021 16:59:00 -0700 Subject: [PATCH 10/24] net: Fix zero-copy head len calculation. [ Upstream commit a17ad0961706244dce48ec941f7e476a38c0e727 ] In some cases skb head could be locked and entire header data is pulled from skb. When skb_zerocopy() called in such cases, following BUG is triggered. This patch fixes it by copying entire skb in such cases. This could be optimized incase this is performance bottleneck. ---8<--- kernel BUG at net/core/skbuff.c:2961! 
invalid opcode: 0000 [#1] SMP PTI CPU: 2 PID: 0 Comm: swapper/2 Tainted: G OE 5.4.0-77-generic #86-Ubuntu Hardware name: OpenStack Foundation OpenStack Nova, BIOS 1.13.0-1ubuntu1.1 04/01/2014 RIP: 0010:skb_zerocopy+0x37a/0x3a0 RSP: 0018:ffffbcc70013ca38 EFLAGS: 00010246 Call Trace: queue_userspace_packet+0x2af/0x5e0 [openvswitch] ovs_dp_upcall+0x3d/0x60 [openvswitch] ovs_dp_process_packet+0x125/0x150 [openvswitch] ovs_vport_receive+0x77/0xd0 [openvswitch] netdev_port_receive+0x87/0x130 [openvswitch] netdev_frame_hook+0x4b/0x60 [openvswitch] __netif_receive_skb_core+0x2b4/0xc90 __netif_receive_skb_one_core+0x3f/0xa0 __netif_receive_skb+0x18/0x60 process_backlog+0xa9/0x160 net_rx_action+0x142/0x390 __do_softirq+0xe1/0x2d6 irq_exit+0xae/0xb0 do_IRQ+0x5a/0xf0 common_interrupt+0xf/0xf Code that triggered BUG: int skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) { int i, j = 0; int plen = 0; /* length of skb->head fragment */ int ret; struct page *page; unsigned int offset; BUG_ON(!from->head_frag && !hlen); Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/core/skbuff.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 99d6f4d1297c..7dba091bc861 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2921,8 +2921,11 @@ skb_zerocopy_headlen(const struct sk_buff *from) if (!from->head_frag || skb_headlen(from) < L1_CACHE_BYTES || - skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) + skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) { hlen = skb_headlen(from); + if (!hlen) + hlen = from->len; + } if (skb_has_frag_list(from)) hlen = from->len; From 0ea2f55babb729007f3b27fd3d444b121deecbb2 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 19 Jul 2021 09:44:39 -0700 Subject: [PATCH 11/24] nvme: fix nvme_setup_command metadata trace event [ Upstream commit 234211b8dd161fa25f192c78d5a8d2dd6bf920a0 ] The metadata address is set after the trace event, so the trace is not capturing anything useful. Rather than logging the memory address, it's useful to know if the command carries a metadata payload, so change the trace event to log that true/false state instead. 
Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig Signed-off-by: Sasha Levin --- drivers/nvme/host/trace.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h index daaf700eae79..35bac7a25422 100644 --- a/drivers/nvme/host/trace.h +++ b/drivers/nvme/host/trace.h @@ -56,7 +56,7 @@ TRACE_EVENT(nvme_setup_cmd, __field(u8, fctype) __field(u16, cid) __field(u32, nsid) - __field(u64, metadata) + __field(bool, metadata) __array(u8, cdw10, 24) ), TP_fast_assign( @@ -66,13 +66,13 @@ TRACE_EVENT(nvme_setup_cmd, __entry->flags = cmd->common.flags; __entry->cid = cmd->common.command_id; __entry->nsid = le32_to_cpu(cmd->common.nsid); - __entry->metadata = le64_to_cpu(cmd->common.metadata); + __entry->metadata = !!blk_integrity_rq(req); __entry->fctype = cmd->fabrics.fctype; __assign_disk_name(__entry->disk, req->rq_disk); memcpy(__entry->cdw10, &cmd->common.cdw10, sizeof(__entry->cdw10)); ), - TP_printk("nvme%d: %sqid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", + TP_printk("nvme%d: %sqid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%x, cmd=(%s %s)", __entry->ctrl_id, __print_disk_name(__entry->disk), __entry->qid, __entry->cid, __entry->nsid, __entry->flags, __entry->metadata, From 38f54217b423c0101d03a00feec6fb8ec608b12e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 24 Jul 2021 15:25:54 -0700 Subject: [PATCH 12/24] ACPI: fix NULL pointer dereference [ Upstream commit fc68f42aa737dc15e7665a4101d4168aadb8e4c4 ] Commit 71f642833284 ("ACPI: utils: Fix reference counting in for_each_acpi_dev_match()") started doing "acpi_dev_put()" on a pointer that was possibly NULL. That fails miserably, because that helper inline function is not set up to handle that case. Just make acpi_dev_put() silently accept a NULL pointer, rather than calling down to put_device() with an invalid offset off that NULL pointer. Link: https://lore.kernel.org/lkml/a607c149-6bf6-0fd0-0e31-100378504da2@kernel.dk/ Reported-and-tested-by: Jens Axboe Tested-by: Daniel Scally Cc: Andy Shevchenko Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- include/acpi/acpi_bus.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 4d67a67964fa..1e5ae3b01eb2 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -681,7 +681,8 @@ acpi_dev_get_first_match_dev(const char *hid, const char *uid, s64 hrv); static inline void acpi_dev_put(struct acpi_device *adev) { - put_device(&adev->dev); + if (adev) + put_device(&adev->dev); } #else /* CONFIG_ACPI */ From 44f522298c946d7f7b68ce0f5c7986f4c926281a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 5 Aug 2021 20:58:57 +0200 Subject: [PATCH 13/24] Revert "Bluetooth: Shutdown controller after workqueues are flushed or cancelled" This reverts commit aa9a2ec7ee08dda41bb565b692f34c620d63b517 which is commit 0ea9fd001a14ebc294f112b0361a4e601551d508 upstream. 
It has been reported to have problems: https://lore.kernel.org/linux-bluetooth/8735ryk0o7.fsf@baylibre.com/ Reported-by: Guenter Roeck Cc: Kai-Heng Feng Cc: Marcel Holtmann Cc: Sasha Levin Link: https://lore.kernel.org/r/efee3a58-a4d2-af22-0931-e81b877ab539@roeck-us.net Signed-off-by: Greg Kroah-Hartman --- net/bluetooth/hci_core.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 37b585c9e857..21a7ea9b70c8 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1672,6 +1672,14 @@ int hci_dev_do_close(struct hci_dev *hdev) BT_DBG("%s %p", hdev->name, hdev); + if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) && + !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && + test_bit(HCI_UP, &hdev->flags)) { + /* Execute vendor specific shutdown routine */ + if (hdev->shutdown) + hdev->shutdown(hdev); + } + cancel_delayed_work(&hdev->power_off); hci_request_cancel_all(hdev); @@ -1745,14 +1753,6 @@ int hci_dev_do_close(struct hci_dev *hdev) clear_bit(HCI_INIT, &hdev->flags); } - if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) && - !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && - test_bit(HCI_UP, &hdev->flags)) { - /* Execute vendor specific shutdown routine */ - if (hdev->shutdown) - hdev->shutdown(hdev); - } - /* flush cmd work */ flush_work(&hdev->cmd_work); From 1b38f70bbc7c2b293d0352010228b896c3cdd914 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Thu, 24 Jun 2021 10:50:59 +0100 Subject: [PATCH 14/24] firmware: arm_scmi: Ensure drivers provide a probe function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 5e469dac326555d2038d199a6329458cc82a34e5 upstream. The bus probe callback calls the driver callback without further checking. Better be safe than sorry and refuse registration of a driver without a probe function to prevent a NULL pointer exception. Link: https://lore.kernel.org/r/20210624095059.4010157-2-sudeep.holla@arm.com Fixes: 933c504424a2 ("firmware: arm_scmi: add scmi protocol bus to enumerate protocol devices") Reported-by: Uwe Kleine-König Tested-by: Cristian Marussi Reviewed-by: Cristian Marussi Acked-by: Uwe Kleine-König Signed-off-by: Sudeep Holla Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/arm_scmi/bus.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/firmware/arm_scmi/bus.c b/drivers/firmware/arm_scmi/bus.c index 7a30952b463d..66d445b14e51 100644 --- a/drivers/firmware/arm_scmi/bus.c +++ b/drivers/firmware/arm_scmi/bus.c @@ -100,6 +100,9 @@ int scmi_driver_register(struct scmi_driver *driver, struct module *owner, { int retval; + if (!driver->probe) + return -EINVAL; + driver->driver.bus = &scmi_bus_type; driver->driver.name = driver->name; driver->driver.owner = owner; From 76f5314d7859dc36121274dfd7d1a7b0545c0032 Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Tue, 8 Jun 2021 11:30:56 +0100 Subject: [PATCH 15/24] firmware: arm_scmi: Add delayed response status check commit f1748b1ee1fa0fd1a074504045b530b62f949188 upstream. A successfully received delayed response could anyway report a failure at the protocol layer in the message status field. Add a check also for this error condition. 
Link: https://lore.kernel.org/r/20210608103056.3388-1-cristian.marussi@arm.com Fixes: 58ecdf03dbb9 ("firmware: arm_scmi: Add support for asynchronous commands and delayed response") Signed-off-by: Cristian Marussi Signed-off-by: Sudeep Holla Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/arm_scmi/driver.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/arm_scmi/driver.c b/drivers/firmware/arm_scmi/driver.c index 48e6e2b48924..4e43bdfa041f 100644 --- a/drivers/firmware/arm_scmi/driver.c +++ b/drivers/firmware/arm_scmi/driver.c @@ -515,8 +515,12 @@ int scmi_do_xfer_with_response(const struct scmi_handle *handle, xfer->async_done = &async_response; ret = scmi_do_xfer(handle, xfer); - if (!ret && !wait_for_completion_timeout(xfer->async_done, timeout)) - ret = -ETIMEDOUT; + if (!ret) { + if (!wait_for_completion_timeout(xfer->async_done, timeout)) + ret = -ETIMEDOUT; + else if (xfer->hdr.status) + ret = scmi_to_linux_errno(xfer->hdr.status); + } xfer->async_done = NULL; return ret; From a0a9546aaec31237eada027a02fc925ba7454289 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 6 Aug 2021 08:28:48 +0200 Subject: [PATCH 16/24] Revert "watchdog: iTCO_wdt: Account for rebooting on second timeout" This reverts commit f58ab0b02ee7b095e0cae4ba706caa86fff5557b which is commit cb011044e34c293e139570ce5c01aed66a34345c upstream. It is reported to cause problems with systems and probably should not have been backported in the first place :( Link: https://lore.kernel.org/r/20210803165108.4154cd52@endymion Reported-by: Jean Delvare Cc: Jan Kiszka Cc: Guenter Roeck Cc: Guenter Roeck Cc: Wim Van Sebroeck Cc: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/watchdog/iTCO_wdt.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c index 08e534fba1bf..e707c4797f76 100644 --- a/drivers/watchdog/iTCO_wdt.c +++ b/drivers/watchdog/iTCO_wdt.c @@ -72,8 +72,6 @@ #define TCOBASE(p) ((p)->tco_res->start) /* SMI Control and Enable Register */ #define SMI_EN(p) ((p)->smi_res->start) -#define TCO_EN (1 << 13) -#define GBL_SMI_EN (1 << 0) #define TCO_RLD(p) (TCOBASE(p) + 0x00) /* TCO Timer Reload/Curr. Value */ #define TCOv1_TMR(p) (TCOBASE(p) + 0x01) /* TCOv1 Timer Initial Value*/ @@ -346,12 +344,8 @@ static int iTCO_wdt_set_timeout(struct watchdog_device *wd_dev, unsigned int t) tmrval = seconds_to_ticks(p, t); - /* - * If TCO SMIs are off, the timer counts down twice before rebooting. - * Otherwise, the BIOS generally reboots when the SMI triggers. - */ - if (p->smi_res && - (SMI_EN(p) & (TCO_EN | GBL_SMI_EN)) != (TCO_EN | GBL_SMI_EN)) + /* For TCO v1 the timer counts down twice before rebooting */ + if (p->iTCO_version == 1) tmrval /= 2; /* from the specs: */ @@ -516,7 +510,7 @@ static int iTCO_wdt_probe(struct platform_device *pdev) * Disables TCO logic generating an SMI# */ val32 = inl(SMI_EN(p)); - val32 &= ~TCO_EN; /* Turn off SMI clearing watchdog */ + val32 &= 0xffffdfff; /* Turn off SMI clearing watchdog */ outl(val32, SMI_EN(p)); } From 283d742988f6b304f32110f39e189a00d4e52b92 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 5 Aug 2021 18:53:38 +0300 Subject: [PATCH 17/24] bpf: Inherit expanded/patched seen count from old aux data commit d203b0fd863a2261e5d00b97f3d060c4c2a6db71 upstream Instead of relying on current env->pass_cnt, use the seen count from the old aux data in adjust_insn_aux_data(), and expand it to the new range of patched instructions. 
This change is valid given we always expand 1:n with n>=1, so what applies to the old/original instruction needs to apply for the replacement as well. Not relying on env->pass_cnt is a prerequisite for a later change where we want to avoid marking an instruction seen when verified under speculative execution path. Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Reviewed-by: Benedict Schlueter Reviewed-by: Piotr Krysiuk Acked-by: Alexei Starovoitov [OP: declare old_data as bool instead of u32 (struct bpf_insn_aux_data.seen is bool in 5.4)] Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index aefd94794796..526e52f45ab3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8304,6 +8304,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, { struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; struct bpf_insn *insn = new_prog->insnsi; + bool old_seen = old_data[off].seen; u32 prog_len; int i; @@ -8324,7 +8325,8 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, memcpy(new_data + off + cnt - 1, old_data + off, sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); for (i = off; i < off + cnt - 1; i++) { - new_data[i].seen = true; + /* Expand insni[off]'s seen count to the patched range. */ + new_data[i].seen = old_seen; new_data[i].zext_dst = insn_has_def32(env, insn + i); } env->insn_aux_data = new_data; From d2f790327f83b457db357e7c66f942bc00d43462 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 5 Aug 2021 18:53:39 +0300 Subject: [PATCH 18/24] bpf: Do not mark insn as seen under speculative path verification commit fe9a5ca7e370e613a9a75a13008a3845ea759d6e upstream ... in such circumstances, we do not want to mark the instruction as seen given the goal is still to jmp-1 rewrite/sanitize dead code, if it is not reachable from the non-speculative path verification. We do however want to verify it for safety regardless. With the patch as-is all the insns that have been marked as seen before the patch will also be marked as seen after the patch (just with a potentially different non-zero count). An upcoming patch will also verify paths that are unreachable in the non-speculative domain, hence this extension is needed. Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Reviewed-by: Benedict Schlueter Reviewed-by: Piotr Krysiuk Acked-by: Alexei Starovoitov [OP: - env->pass_cnt is not used in 5.4, so adjust sanitize_mark_insn_seen() to assign "true" instead - drop sanitize_insn_aux_data() comment changes, as the function is not present in 5.4] Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 526e52f45ab3..02a04a30070b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4435,6 +4435,19 @@ do_sim: return !ret ? REASON_STACK : 0; } +static void sanitize_mark_insn_seen(struct bpf_verifier_env *env) +{ + struct bpf_verifier_state *vstate = env->cur_state; + + /* If we simulate paths under speculation, we don't update the + * insn as 'seen' such that when we verify unreachable paths in + * the non-speculative domain, sanitize_dead_code() can still + * rewrite/sanitize them. 
+ */ + if (!vstate->speculative) + env->insn_aux_data[env->insn_idx].seen = true; +} + static int sanitize_err(struct bpf_verifier_env *env, const struct bpf_insn *insn, int reason, const struct bpf_reg_state *off_reg, @@ -7790,7 +7803,7 @@ static int do_check(struct bpf_verifier_env *env) } regs = cur_regs(env); - env->insn_aux_data[env->insn_idx].seen = true; + sanitize_mark_insn_seen(env); prev_insn_idx = env->insn_idx; if (class == BPF_ALU || class == BPF_ALU64) { @@ -8025,7 +8038,7 @@ process_bpf_exit: return err; env->insn_idx++; - env->insn_aux_data[env->insn_idx].seen = true; + sanitize_mark_insn_seen(env); } else { verbose(env, "invalid BPF_LD mode\n"); return -EINVAL; From fd568de5806f8859190e6305a1792ba8cb20de61 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 5 Aug 2021 18:53:40 +0300 Subject: [PATCH 19/24] bpf: Fix leakage under speculation on mispredicted branches commit 9183671af6dbf60a1219371d4ed73e23f43b49db upstream The verifier only enumerates valid control-flow paths and skips paths that are unreachable in the non-speculative domain. And so it can miss issues under speculative execution on mispredicted branches. For example, a type confusion has been demonstrated with the following crafted program: // r0 = pointer to a map array entry // r6 = pointer to readable stack slot // r9 = scalar controlled by attacker 1: r0 = *(u64 *)(r0) // cache miss 2: if r0 != 0x0 goto line 4 3: r6 = r9 4: if r0 != 0x1 goto line 6 5: r9 = *(u8 *)(r6) 6: // leak r9 Since line 3 runs iff r0 == 0 and line 5 runs iff r0 == 1, the verifier concludes that the pointer dereference on line 5 is safe. But: if the attacker trains both the branches to fall-through, such that the following is speculatively executed ... r6 = r9 r9 = *(u8 *)(r6) // leak r9 ... then the program will dereference an attacker-controlled value and could leak its content under speculative execution via side-channel. This requires to mistrain the branch predictor, which can be rather tricky, because the branches are mutually exclusive. However such training can be done at congruent addresses in user space using different branches that are not mutually exclusive. That is, by training branches in user space ... A: if r0 != 0x0 goto line C B: ... C: if r0 != 0x0 goto line D D: ... ... such that addresses A and C collide to the same CPU branch prediction entries in the PHT (pattern history table) as those of the BPF program's lines 2 and 4, respectively. A non-privileged attacker could simply brute force such collisions in the PHT until observing the attack succeeding. Alternative methods to mistrain the branch predictor are also possible that avoid brute forcing the collisions in the PHT. A reliable attack has been demonstrated, for example, using the following crafted program: // r0 = pointer to a [control] map array entry // r7 = *(u64 *)(r0 + 0), training/attack phase // r8 = *(u64 *)(r0 + 8), oob address // [...] // r0 = pointer to a [data] map array entry 1: if r7 == 0x3 goto line 3 2: r8 = r0 // crafted sequence of conditional jumps to separate the conditional // branch in line 193 from the current execution flow 3: if r0 != 0x0 goto line 5 4: if r0 == 0x0 goto exit 5: if r0 != 0x0 goto line 7 6: if r0 == 0x0 goto exit [...] 187: if r0 != 0x0 goto line 189 188: if r0 == 0x0 goto exit // load any slowly-loaded value (due to cache miss in phase 3) ... 189: r3 = *(u64 *)(r0 + 0x1200) // ... 
and turn it into known zero for verifier, while preserving slowly- // loaded dependency when executing: 190: r3 &= 1 191: r3 &= 2 // speculatively bypassed phase dependency 192: r7 += r3 193: if r7 == 0x3 goto exit 194: r4 = *(u8 *)(r8 + 0) // leak r4 As can be seen, in training phase (phase != 0x3), the condition in line 1 turns into false and therefore r8 with the oob address is overridden with the valid map value address, which in line 194 we can read out without issues. However, in attack phase, line 2 is skipped, and due to the cache miss in line 189 where the map value is (zeroed and later) added to the phase register, the condition in line 193 takes the fall-through path due to prior branch predictor training, where under speculation, it'll load the byte at oob address r8 (unknown scalar type at that point) which could then be leaked via side-channel. One way to mitigate these is to 'branch off' an unreachable path, meaning, the current verification path keeps following the is_branch_taken() path and we push the other branch to the verification stack. Given this is unreachable from the non-speculative domain, this branch's vstate is explicitly marked as speculative. This is needed for two reasons: i) if this path is solely seen from speculative execution, then we later on still want the dead code elimination to kick in in order to sanitize these instructions with jmp-1s, and ii) to ensure that paths walked in the non-speculative domain are not pruned from earlier walks of paths walked in the speculative domain. Additionally, for robustness, we mark the registers which have been part of the conditional as unknown in the speculative path given there should be no assumptions made on their content. The fix in here mitigates type confusion attacks described earlier due to i) all code paths in the BPF program being explored and ii) existing verifier logic already ensuring that given memory access instruction references one specific data structure. An alternative to this fix that has also been looked at in this scope was to mark aux->alu_state at the jump instruction with a BPF_JMP_TAKEN state as well as direction encoding (always-goto, always-fallthrough, unknown), such that mixing of different always-* directions themselves as well as mixing of always-* with unknown directions would cause a program rejection by the verifier, e.g. programs with constructs like 'if ([...]) { x = 0; } else { x = 1; }' with subsequent 'if (x == 1) { [...] }'. For unprivileged, this would result in only single direction always-* taken paths, and unknown taken paths being allowed, such that the former could be patched from a conditional jump to an unconditional jump (ja). Compared to this approach here, it would have two downsides: i) valid programs that otherwise are not performing any pointer arithmetic, etc, would potentially be rejected/broken, and ii) we are required to turn off path pruning for unprivileged, where both can be avoided in this work through pushing the invalid branch to the verification stack. The issue was originally discovered by Adam and Ofek, and later independently discovered and reported as a result of Benedict and Piotr's research work. 
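Before reading the diff below, the idea can be summarized with a small self-contained C toy model. This is only a sketch under simplified assumptions (hypothetical toy_* names, both compared registers scrubbed, the unprivileged/allow_ptr_leaks gating not shown); the real change is the sanitize_speculative_path() / check_cond_jmp_op() hunk in the diff that follows.

  #include <stdbool.h>
  #include <stdio.h>

  /*
   * Toy model, not kernel code: when a conditional jump is statically
   * decided, keep following the decided edge but also queue the
   * "impossible" edge as a speculative state whose compared registers
   * are degraded to unknown.  The kernel gates this on the program not
   * being allowed to leak pointers and only scrubs the source register
   * for register-register compares.
   */
  #define MAX_STACK 64
  #define NUM_REGS  11

  struct toy_state {
      int insn_idx;
      bool speculative;
      bool reg_known[NUM_REGS];
  };

  struct toy_env {
      struct toy_state stack[MAX_STACK];
      int top;
  };

  static struct toy_state *toy_push_state(struct toy_env *env,
                                          const struct toy_state *cur,
                                          int next_idx)
  {
      if (env->top >= MAX_STACK)
          return NULL;
      struct toy_state *st = &env->stack[env->top++];
      *st = *cur;
      st->insn_idx = next_idx;
      return st;
  }

  /* pred: 1 = branch statically taken, 0 = statically not taken */
  static int toy_cond_jmp(struct toy_env *env, struct toy_state *cur,
                          int pred, int off, int dst_reg, int src_reg)
  {
      int spec_idx = pred ? cur->insn_idx + 1 : cur->insn_idx + off + 1;
      struct toy_state *spec = toy_push_state(env, cur, spec_idx);

      if (!spec)
          return -1;                       /* like returning -EFAULT */
      spec->speculative = true;            /* walked only for sanitizing */
      spec->reg_known[dst_reg] = false;    /* no assumptions on contents */
      spec->reg_known[src_reg] = false;

      cur->insn_idx = pred ? cur->insn_idx + off + 1 : cur->insn_idx + 1;
      return 0;
  }

  int main(void)
  {
      struct toy_env env = { .top = 0 };
      struct toy_state cur = { .insn_idx = 2, .speculative = false };

      /* "if r0 != 0x0 goto +1" known taken: queue the fall-through, too. */
      toy_cond_jmp(&env, &cur, 1, 1, 0, 0);
      printf("continue at insn %d, speculative state queued at insn %d\n",
             cur.insn_idx, env.stack[0].insn_idx);
      return 0;
  }

The point of the speculative flag in the toy, as in the real patch, is that such states are still verified for safety and later sanitized, but never allow pruning of paths reachable in the non-speculative domain.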
Fixes: b2157399cc98 ("bpf: prevent out-of-bounds speculation") Reported-by: Adam Morrison Reported-by: Ofek Kirzner Reported-by: Benedict Schlueter Reported-by: Piotr Krysiuk Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Reviewed-by: Benedict Schlueter Reviewed-by: Piotr Krysiuk Acked-by: Alexei Starovoitov [OP: use allow_ptr_leaks instead of bypass_spec_v1] Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 44 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 02a04a30070b..52c2b11a0b47 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4346,6 +4346,27 @@ struct bpf_sanitize_info { bool mask_to_left; }; +static struct bpf_verifier_state * +sanitize_speculative_path(struct bpf_verifier_env *env, + const struct bpf_insn *insn, + u32 next_idx, u32 curr_idx) +{ + struct bpf_verifier_state *branch; + struct bpf_reg_state *regs; + + branch = push_stack(env, next_idx, curr_idx, true); + if (branch && insn) { + regs = branch->frame[branch->curframe]->regs; + if (BPF_SRC(insn->code) == BPF_K) { + mark_reg_unknown(env, regs, insn->dst_reg); + } else if (BPF_SRC(insn->code) == BPF_X) { + mark_reg_unknown(env, regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->src_reg); + } + } + return branch; +} + static int sanitize_ptr_alu(struct bpf_verifier_env *env, struct bpf_insn *insn, const struct bpf_reg_state *ptr_reg, @@ -4429,7 +4450,8 @@ do_sim: tmp = *dst_reg; *dst_reg = *ptr_reg; } - ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true); + ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1, + env->insn_idx); if (!ptr_is_dst_reg && ret) *dst_reg = tmp; return !ret ? REASON_STACK : 0; @@ -6079,14 +6101,28 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (err) return err; } + if (pred == 1) { - /* only follow the goto, ignore fall-through */ + /* Only follow the goto, ignore fall-through. If needed, push + * the fall-through branch for simulation under speculative + * execution. + */ + if (!env->allow_ptr_leaks && + !sanitize_speculative_path(env, insn, *insn_idx + 1, + *insn_idx)) + return -EFAULT; *insn_idx += insn->off; return 0; } else if (pred == 0) { - /* only follow fall-through branch, since - * that's where the program will go + /* Only follow the fall-through branch, since that's where the + * program will go. If needed, push the goto branch for + * simulation under speculative execution. */ + if (!env->allow_ptr_leaks && + !sanitize_speculative_path(env, insn, + *insn_idx + insn->off + 1, + *insn_idx)) + return -EFAULT; return 0; } From 8dec99abcd74b817666f48645fe54fc759163e52 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 5 Aug 2021 18:53:41 +0300 Subject: [PATCH 20/24] bpf: Test_verifier, add alu32 bounds tracking tests commit 41f70fe0649dddf02046315dc566e06da5a2dc91 upstream It's possible to have divergent ALU32 and ALU64 bounds when using JMP32 instructions and ALU64 arithmetic operations. Sometimes clang will even generate this code. Because the case is a bit tricky, let's add a specific test for it. Here is a pseudocode asm version to illustrate the idea, 1 r0 = 0xffffffff00000001; 2 if w0 > 1 goto %l[fail]; 3 r0 += 1 5 if w0 > 2 goto %l[fail] 6 exit The intent here is that the verifier will fail the load if the 32bit bounds are not tracked correctly through the ALU64 op. Similarly we can check the 64bit bounds are correctly zero extended after ALU32 ops.
1 r0 = 0xffffffff00000001; 2 w0 += 1 2 if r0 > 3 goto %l[fail]; 6 exit The above will fail if we do not correctly zero extend 64bit bounds after 32bit op. Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/158560430155.10843.514209255758200922.stgit@john-Precision-5820-Tower Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- tools/testing/selftests/bpf/verifier/bounds.c | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tools/testing/selftests/bpf/verifier/bounds.c b/tools/testing/selftests/bpf/verifier/bounds.c index d55f476f2237..d8e5388c9ba7 100644 --- a/tools/testing/selftests/bpf/verifier/bounds.c +++ b/tools/testing/selftests/bpf/verifier/bounds.c @@ -506,3 +506,42 @@ .errstr = "map_value pointer and 1000000000000", .result = REJECT }, +{ + "bounds check mixed 32bit and 64bit arithmatic. test1", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_1, -1), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 32), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1), + /* r1 = 0xffffFFFF00000001 */ + BPF_JMP32_IMM(BPF_JGT, BPF_REG_1, 1, 3), + /* check ALU64 op keeps 32bit bounds */ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1), + BPF_JMP32_IMM(BPF_JGT, BPF_REG_1, 2, 1), + BPF_JMP_A(1), + /* invalid ldx if bounds are lost above */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, -1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT +}, +{ + "bounds check mixed 32bit and 64bit arithmatic. test2", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_1, -1), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 32), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1), + /* r1 = 0xffffFFFF00000001 */ + BPF_MOV64_IMM(BPF_REG_2, 3), + /* r1 = 0x2 */ + BPF_ALU32_IMM(BPF_ADD, BPF_REG_1, 1), + /* check ALU32 op zero extends 64bit bounds */ + BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, 1), + BPF_JMP_A(1), + /* invalid ldx if bounds are lost above */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, -1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT +}, From d3796e8f6b3d25c74e416ec3eb0efc011acb547c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 5 Aug 2021 18:53:42 +0300 Subject: [PATCH 21/24] bpf, selftests: Add a verifier test for assigning 32bit reg states to 64bit ones commit cf66c29bd7534813d2e1971fab71e25fe87c7e0a upstream Added a verifier test for assigning 32bit reg states to 64bit where 32bit reg holds a constant value of 0. Without previous kernel verifier.c fix, the test in this patch will fail. 
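Since the BPF_*() macro encoding in the diff below can be dense to read, here is a rough restricted-C counterpart of the pattern the test exercises. It is an illustration only, not part of the patch; the local SEC() fallback, the section name and the claim about which registers clang picks are assumptions, and the actual test is the macro sequence added below.

  #include <linux/bpf.h>

  #ifndef SEC
  #define SEC(name) __attribute__((section(name), used))
  #endif

  /*
   * Illustration only, not part of the patch: a 32-bit constant zero is
   * copied between 32-bit registers and then used in 64-bit pointer
   * arithmetic on skb->data, followed by a bounds check against
   * skb->data_end.  Built with clang -O2 -target bpf this may come out
   * as the wA = 0 / wB = wA / r6 += rB shape the verifier has to track.
   */
  SEC("classifier")
  int assign_32bit_to_64bit(struct __sk_buff *skb)
  {
      void *data     = (void *)(long)skb->data;
      void *data_end = (void *)(long)skb->data_end;
      __u32 zero = 0;                  /* wA = 0  */
      __u32 off  = zero;               /* wB = wA */
      unsigned char *p = data;

      p += off;                        /* 64-bit add of the 32-bit value */
      if ((void *)(p + 8) > data_end)  /* bounds check against data_end  */
          return 0;
      return p[0];                     /* only safe if bounds were kept  */
  }

The test itself encodes this directly with BPF_MOV32_IMM()/BPF_MOV32_REG(), so it does not depend on what any particular compiler emits.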
Signed-off-by: Yonghong Song Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/159077335867.6014.2075350327073125374.stgit@john-Precision-5820-Tower Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- tools/testing/selftests/bpf/verifier/bounds.c | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tools/testing/selftests/bpf/verifier/bounds.c b/tools/testing/selftests/bpf/verifier/bounds.c index d8e5388c9ba7..c42ce135786a 100644 --- a/tools/testing/selftests/bpf/verifier/bounds.c +++ b/tools/testing/selftests/bpf/verifier/bounds.c @@ -545,3 +545,25 @@ }, .result = ACCEPT }, +{ + "assigning 32bit bounds to 64bit for wA = 0, wB = wA", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_8, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_MOV32_IMM(BPF_REG_9, 0), + BPF_MOV32_REG(BPF_REG_2, BPF_REG_9), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_7), + BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 8), + BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_8, 1), + BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_6, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, +}, From a0f66ddf05c2050e1b7f53256bd9c25c2bb3022b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 5 Aug 2021 18:53:43 +0300 Subject: [PATCH 22/24] bpf, selftests: Adjust few selftest outcomes wrt unreachable code commit 973377ffe8148180b2651825b92ae91988141b05 upstream In almost all cases from test_verifier that have been changed in here, we've had an unreachable path with a load from a register which has an invalid address on purpose. This was basically to make sure that we never walk this path and to have the verifier complain if it would otherwise. Change it to match on the right error for unprivileged given we now test these paths under speculative execution. There's one case where we match on exact # of insns_processed. Due to the extra path, this will of course mismatch on unprivileged. Thus, restrict the test->insn_processed check to privileged-only. In one other case, we result in a 'pointer comparison prohibited' error. This is similarly due to verifying an 'invalid' branch where we end up with a value pointer on one side of the comparison. 
Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov [OP: ignore changes to tests that do not exist in 5.4] Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- tools/testing/selftests/bpf/test_verifier.c | 2 +- tools/testing/selftests/bpf/verifier/bounds.c | 4 ++++ .../selftests/bpf/verifier/dead_code.c | 2 ++ tools/testing/selftests/bpf/verifier/jmp32.c | 22 +++++++++++++++++++ tools/testing/selftests/bpf/verifier/jset.c | 10 +++++---- tools/testing/selftests/bpf/verifier/unpriv.c | 2 ++ .../selftests/bpf/verifier/value_ptr_arith.c | 7 +++--- 7 files changed, 41 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index d27fd929abb9..43224c5ec1e9 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -980,7 +980,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv, } } - if (test->insn_processed) { + if (!unpriv && test->insn_processed) { uint32_t insn_processed; char *proc; diff --git a/tools/testing/selftests/bpf/verifier/bounds.c b/tools/testing/selftests/bpf/verifier/bounds.c index c42ce135786a..92c02e4a1b62 100644 --- a/tools/testing/selftests/bpf/verifier/bounds.c +++ b/tools/testing/selftests/bpf/verifier/bounds.c @@ -523,6 +523,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, -1), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT }, { @@ -543,6 +545,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, -1), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT }, { diff --git a/tools/testing/selftests/bpf/verifier/dead_code.c b/tools/testing/selftests/bpf/verifier/dead_code.c index 50a8a63be4ac..a7e60a773da6 100644 --- a/tools/testing/selftests/bpf/verifier/dead_code.c +++ b/tools/testing/selftests/bpf/verifier/dead_code.c @@ -8,6 +8,8 @@ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 10, -4), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R9 !read_ok", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, }, diff --git a/tools/testing/selftests/bpf/verifier/jmp32.c b/tools/testing/selftests/bpf/verifier/jmp32.c index f0961c58581e..f2fabf6ebc61 100644 --- a/tools/testing/selftests/bpf/verifier/jmp32.c +++ b/tools/testing/selftests/bpf/verifier/jmp32.c @@ -72,6 +72,8 @@ BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R9 !read_ok", + .result_unpriv = REJECT, .result = ACCEPT, }, { @@ -135,6 +137,8 @@ BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R9 !read_ok", + .result_unpriv = REJECT, .result = ACCEPT, }, { @@ -198,6 +202,8 @@ BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R9 !read_ok", + .result_unpriv = REJECT, .result = ACCEPT, }, { @@ -265,6 +271,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, }, @@ -333,6 +341,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, }, @@ -401,6 +411,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, }, @@ -469,6 +481,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), 
BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, }, @@ -537,6 +551,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, }, @@ -605,6 +621,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, }, @@ -673,6 +691,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, }, @@ -741,6 +761,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, }, diff --git a/tools/testing/selftests/bpf/verifier/jset.c b/tools/testing/selftests/bpf/verifier/jset.c index 8dcd4e0383d5..11fc68da735e 100644 --- a/tools/testing/selftests/bpf/verifier/jset.c +++ b/tools/testing/selftests/bpf/verifier/jset.c @@ -82,8 +82,8 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .retval_unpriv = 1, - .result_unpriv = ACCEPT, + .errstr_unpriv = "R9 !read_ok", + .result_unpriv = REJECT, .retval = 1, .result = ACCEPT, }, @@ -141,7 +141,8 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .result_unpriv = ACCEPT, + .errstr_unpriv = "R9 !read_ok", + .result_unpriv = REJECT, .result = ACCEPT, }, { @@ -162,6 +163,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .result_unpriv = ACCEPT, + .errstr_unpriv = "R9 !read_ok", + .result_unpriv = REJECT, .result = ACCEPT, }, diff --git a/tools/testing/selftests/bpf/verifier/unpriv.c b/tools/testing/selftests/bpf/verifier/unpriv.c index c3f6f650deb7..593f5b586e87 100644 --- a/tools/testing/selftests/bpf/verifier/unpriv.c +++ b/tools/testing/selftests/bpf/verifier/unpriv.c @@ -418,6 +418,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R7 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 0, }, diff --git a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c index 28d44e6aa0b7..a8dab0f58462 100644 --- a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c +++ b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c @@ -120,7 +120,7 @@ .fixup_map_array_48b = { 1 }, .result = ACCEPT, .result_unpriv = REJECT, - .errstr_unpriv = "R2 tried to add from different maps, paths or scalars", + .errstr_unpriv = "R2 pointer comparison prohibited", .retval = 0, }, { @@ -159,7 +159,8 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), // fake-dead code; targeted from branch A to - // prevent dead code sanitization + // prevent dead code sanitization, rejected + // via branch B however BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), @@ -167,7 +168,7 @@ .fixup_map_array_48b = { 1 }, .result = ACCEPT, .result_unpriv = REJECT, - .errstr_unpriv = "R2 tried to add from different maps, paths or scalars", + .errstr_unpriv = "R0 invalid mem access 'inv'", .retval = 0, }, { From 03ff8a4f9db632ef8fc163b6bcdfa0689043472b Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 1 Aug 2021 20:00:23 -0700 Subject: [PATCH 23/24] spi: mediatek: Fix fifo transfer commit 0d5c3954b35eddff0da0436c31e8d721eceb7dc2 upstream. 
Commit 3a70dd2d0503 ("spi: mediatek: fix fifo rx mode") claims that fifo RX mode was never handled, and adds the presumably missing code to the FIFO transfer function. However, the claim that receive data was not handled is incorrect. It was handled as part of interrupt handling after the transfer was complete. The code added with the above mentioned commit reads data from the receive FIFO before the transfer is started, which is wrong. This results in an actual transfer error on a Hayato Chromebook. Remove the code trying to handle receive data before the transfer is started to fix the problem. Fixes: 3a70dd2d0503 ("spi: mediatek: fix fifo rx mode") Cc: Peter Hess Cc: Frank Wunderlich Cc: Tzung-Bi Shih Cc: Hsin-Yi Wang Signed-off-by: Guenter Roeck Tested-by: Hsin-Yi Wang Tested-by: Tzung-Bi Shih Link: https://lore.kernel.org/r/20210802030023.1748777-1-linux@roeck-us.net Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- drivers/spi/spi-mt65xx.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/drivers/spi/spi-mt65xx.c b/drivers/spi/spi-mt65xx.c index 81eac9fbd08c..cafb773f7861 100644 --- a/drivers/spi/spi-mt65xx.c +++ b/drivers/spi/spi-mt65xx.c @@ -426,24 +426,15 @@ static int mtk_spi_fifo_transfer(struct spi_master *master, mtk_spi_prepare_transfer(master, xfer); mtk_spi_setup_packet(master); - cnt = xfer->len / 4; - if (xfer->tx_buf) + if (xfer->tx_buf) { + cnt = xfer->len / 4; iowrite32_rep(mdata->base + SPI_TX_DATA_REG, xfer->tx_buf, cnt); - - if (xfer->rx_buf) - ioread32_rep(mdata->base + SPI_RX_DATA_REG, xfer->rx_buf, cnt); - - remainder = xfer->len % 4; - if (remainder > 0) { - reg_val = 0; - if (xfer->tx_buf) { + remainder = xfer->len % 4; + if (remainder > 0) { + reg_val = 0; memcpy(®_val, xfer->tx_buf + (cnt * 4), remainder); writel(reg_val, mdata->base + SPI_TX_DATA_REG); } - if (xfer->rx_buf) { - reg_val = readl(mdata->base + SPI_RX_DATA_REG); - memcpy(xfer->rx_buf + (cnt * 4), ®_val, remainder); - } } mtk_spi_enable_transfer(master); From e350cd02e293be9a6b93398b2d3ff1edf7695ab2 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 8 Aug 2021 09:04:09 +0200 Subject: [PATCH 24/24] Linux 5.4.139 Link: https://lore.kernel.org/r/20210806081112.104686873@linuxfoundation.org Tested-by: Jon Hunter Tested-by: Guenter Roeck Tested-by: Sudip Mukherjee Tested-by: Linux Kernel Functional Testing Tested-by: Aakash Hemadri Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5a9d6caef82a..1174536034b3 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 4 -SUBLEVEL = 138 +SUBLEVEL = 139 EXTRAVERSION = NAME = Kleptomaniac Octopus