for-6.14-rc4-tag

-----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEE8rQSAMVO+zA4DBdWxWXV+ddtWDsFAme95g8ACgkQxWXV+ddt
 WDvi3g//V55iBXnPv0Jrs7b95GRskYv8A4vJsZhGtub4PlcEh8S6Q1IoU3qwiKHv
 E2THDA/A14qetxh3tSo73+RdS3JHpIH4QKjO54k74gOh45OEUs4Lq8NBAujmpz4b
 BMZZnM5iyZipNfbebUa/XxlPLvHg8D2rUqwycS/A0c5BE56HTvVzmKL3RdUfkAvA
 uZaJa6FOKfr6ge3ikl/dm+Rl7f+ZymIK4T9XsW3Lt223siYvcLJvWEIL0tk9B1y/
 ZUQNqPOCHY0mX/zPC0425LoeH3LWDPyZPCakaY8tiwI20p/sP+hPLBC8WDrJvoam
 losu6v8EqkYK9zND/ETVq3d1Y9mzub/soKuM+aDQ/UM0JXz1vI3RYQcpskECR0Gf
 ZPq5tv+dSBbMmscvkxnkuNBaTr3IbOhkxaKwOvdoRN9F4HbmhgxTscshaQHklmiG
 4qRx2HtW9Zw8ufyLUFUYaRAj45eFDZMQStQMCNSECD8X+fS6CPGUqGFcuXrm+kLL
 v6k0cbvh1NOLSchqtfR4rochJFUp5veUNHoYQ7YRy3CqV1yrF7iM1e0G1rvyOQYQ
 9tpN93IYwLItRdUjtqyS/q8WOddRTo0LTqh5HDXPnLd3jc/kO7KjHv9dJna7wyhO
 MUJmLlpy1dRDHCvTl70oF0Nxe4Ve20n7U2QayF5bMGtCmQnzGL0=
 =4+6s
 -----END PGP SIGNATURE-----

Merge tag 'for-6.14-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:

 - extent map shrinker fixes:
     - fix potential use after free accessing an inode to reach fs_info,
       the shrinker could do iput() in the meantime
     - skip unnecessary scanning of inodes without extent maps
     - do direct iput(), no need for indirection via workqueue

 - fix race when extending i_size during buffered writes on filesystems
   where block size is smaller than page size

 - fix minor memory leak in selftests

 - print a descriptive error message when the seed device fsid cannot be
   found

* tag 'for-6.14-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: fix data overwriting bug during buffered write when block size < page size
  btrfs: output an error message if btrfs failed to find the seed fsid
  btrfs: do regular iput instead of delayed iput during extent map shrinking
  btrfs: skip inodes without loaded extent maps when shrinking extent maps
  btrfs: fix use-after-free on inode when scanning root during em shrinking
  btrfs: selftests: fix btrfs_test_delayed_refs() leak of transaction
This commit is contained in:
Linus Torvalds 2025-02-25 09:42:15 -08:00
commit cc8a0934d0
4 changed files with 73 additions and 26 deletions

View File

@ -1128,6 +1128,8 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
long nr_dropped = 0;
struct rb_node *node;
lockdep_assert_held_write(&tree->lock);
/*
* Take the mmap lock so that we serialize with the inode logging phase
* of fsync because we may need to set the full sync flag on the inode,
@ -1139,28 +1141,12 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
* to find new extents, which may not be there yet because ordered
* extents haven't completed yet.
*
* We also do a try lock because otherwise we could deadlock. This is
* because the shrinker for this filesystem may be invoked while we are
* in a path that is holding the mmap lock in write mode. For example in
* a reflink operation while COWing an extent buffer, when allocating
* pages for a new extent buffer and under memory pressure, the shrinker
* may be invoked, and therefore we would deadlock by attempting to read
* lock the mmap lock while we are holding already a write lock on it.
* We also do a try lock because we don't want to block for too long and
* we are holding the extent map tree's lock in write mode.
*/
if (!down_read_trylock(&inode->i_mmap_lock))
return 0;
/*
* We want to be fast so if the lock is busy we don't want to spend time
* waiting for it - either some task is about to do IO for the inode or
* we may have another task shrinking extent maps, here in this code, so
* skip this inode.
*/
if (!write_trylock(&tree->lock)) {
up_read(&inode->i_mmap_lock);
return 0;
}
node = rb_first(&tree->root);
while (node) {
struct rb_node *next = rb_next(node);
@ -1201,12 +1187,61 @@ next:
break;
node = next;
}
write_unlock(&tree->lock);
up_read(&inode->i_mmap_lock);
return nr_dropped;
}
static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root,
u64 min_ino)
{
struct btrfs_inode *inode;
unsigned long from = min_ino;
xa_lock(&root->inodes);
while (true) {
struct extent_map_tree *tree;
inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
if (!inode)
break;
tree = &inode->extent_tree;
/*
* We want to be fast so if the lock is busy we don't want to
* spend time waiting for it (some task is about to do IO for
* the inode).
*/
if (!write_trylock(&tree->lock))
goto next;
/*
* Skip inode if it doesn't have loaded extent maps, so we avoid
* getting a reference and doing an iput later. This includes
* cases like files that were opened for things like stat(2), or
* files with all extent maps previously released through the
* release folio callback (btrfs_release_folio()) or released in
* a previous run, or directories which never have extent maps.
*/
if (RB_EMPTY_ROOT(&tree->root)) {
write_unlock(&tree->lock);
goto next;
}
if (igrab(&inode->vfs_inode))
break;
write_unlock(&tree->lock);
next:
from = btrfs_ino(inode) + 1;
cond_resched_lock(&root->inodes.xa_lock);
}
xa_unlock(&root->inodes);
return inode;
}
static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@ -1214,21 +1249,21 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
long nr_dropped = 0;
u64 min_ino = fs_info->em_shrinker_last_ino + 1;
inode = btrfs_find_first_inode(root, min_ino);
inode = find_first_inode_to_shrink(root, min_ino);
while (inode) {
nr_dropped += btrfs_scan_inode(inode, ctx);
write_unlock(&inode->extent_tree.lock);
min_ino = btrfs_ino(inode) + 1;
fs_info->em_shrinker_last_ino = btrfs_ino(inode);
btrfs_add_delayed_iput(inode);
iput(&inode->vfs_inode);
if (ctx->scanned >= ctx->nr_to_scan ||
btrfs_fs_closing(inode->root->fs_info))
if (ctx->scanned >= ctx->nr_to_scan || btrfs_fs_closing(fs_info))
break;
cond_resched();
inode = btrfs_find_first_inode(root, min_ino);
inode = find_first_inode_to_shrink(root, min_ino);
}
if (inode) {

View File

@ -1090,7 +1090,7 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
u64 lockend;
size_t num_written = 0;
ssize_t ret;
loff_t old_isize = i_size_read(inode);
loff_t old_isize;
unsigned int ilock_flags = 0;
const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
@ -1103,6 +1103,13 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
if (ret < 0)
return ret;
/*
* We can only trust the isize with inode lock held, or it can race with
* other buffered writes and cause incorrect call of
* pagecache_isize_extended() to overwrite existing data.
*/
old_isize = i_size_read(inode);
ret = generic_write_checks(iocb, i);
if (ret <= 0)
goto out;

View File

@ -1009,6 +1009,7 @@ int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize)
if (!ret)
ret = select_delayed_refs_test(&trans);
kfree(transaction);
out_free_fs_info:
btrfs_free_dummy_fs_info(fs_info);
return ret;

View File

@ -7200,8 +7200,12 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
fs_devices = find_fsid(fsid, NULL);
if (!fs_devices) {
if (!btrfs_test_opt(fs_info, DEGRADED))
if (!btrfs_test_opt(fs_info, DEGRADED)) {
btrfs_err(fs_info,
"failed to find fsid %pU when attempting to open seed devices",
fsid);
return ERR_PTR(-ENOENT);
}
fs_devices = alloc_fs_devices(fsid);
if (IS_ERR(fs_devices))