for-6.14-rc4-tag
-----BEGIN PGP SIGNATURE-----

iQIzBAABCgAdFiEE8rQSAMVO+zA4DBdWxWXV+ddtWDsFAme95g8ACgkQxWXV+ddt
WDvi3g//V55iBXnPv0Jrs7b95GRskYv8A4vJsZhGtub4PlcEh8S6Q1IoU3qwiKHv
E2THDA/A14qetxh3tSo73+RdS3JHpIH4QKjO54k74gOh45OEUs4Lq8NBAujmpz4b
BMZZnM5iyZipNfbebUa/XxlPLvHg8D2rUqwycS/A0c5BE56HTvVzmKL3RdUfkAvA
uZaJa6FOKfr6ge3ikl/dm+Rl7f+ZymIK4T9XsW3Lt223siYvcLJvWEIL0tk9B1y/
ZUQNqPOCHY0mX/zPC0425LoeH3LWDPyZPCakaY8tiwI20p/sP+hPLBC8WDrJvoam
losu6v8EqkYK9zND/ETVq3d1Y9mzub/soKuM+aDQ/UM0JXz1vI3RYQcpskECR0Gf
ZPq5tv+dSBbMmscvkxnkuNBaTr3IbOhkxaKwOvdoRN9F4HbmhgxTscshaQHklmiG
4qRx2HtW9Zw8ufyLUFUYaRAj45eFDZMQStQMCNSECD8X+fS6CPGUqGFcuXrm+kLL
v6k0cbvh1NOLSchqtfR4rochJFUp5veUNHoYQ7YRy3CqV1yrF7iM1e0G1rvyOQYQ
9tpN93IYwLItRdUjtqyS/q8WOddRTo0LTqh5HDXPnLd3jc/kO7KjHv9dJna7wyhO
MUJmLlpy1dRDHCvTl70oF0Nxe4Ve20n7U2QayF5bMGtCmQnzGL0=
=4+6s
-----END PGP SIGNATURE-----

Merge tag 'for-6.14-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:

 - extent map shrinker fixes:
     - fix potential use after free accessing an inode to reach fs_info,
       the shrinker could do iput() in the meantime
     - skip unnecessary scanning of inodes without extent maps
     - do direct iput(), no need for indirection via workqueue

 - in block < page mode, fix race when extending i_size in buffered mode

 - fix minor memory leak in selftests

 - print descriptive error message when seeding device is not found

* tag 'for-6.14-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: fix data overwriting bug during buffered write when block size < page size
  btrfs: output an error message if btrfs failed to find the seed fsid
  btrfs: do regular iput instead of delayed iput during extent map shrinking
  btrfs: skip inodes without loaded extent maps when shrinking extent maps
  btrfs: fix use-after-free on inode when scanning root during em shrinking
  btrfs: selftests: fix btrfs_test_delayed_refs() leak of transaction
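The shrinker use-after-free fix listed first above comes down to a single rule: nothing reached through the inode may be dereferenced once iput() has dropped what may be the last reference. Below is a minimal sketch of that pattern, using hypothetical stand-in types and helpers (fs_info, inode_stub, grab_next_inode(), put_inode(), scan_extent_maps()) rather than the real btrfs code, which follows in the diff:

/*
 * Sketch only, not the btrfs implementation: everything still needed after
 * the final put is read while the inode reference is held, and only those
 * cached values are used afterwards.
 */
struct fs_info { int closing; };                                /* stand-in */
struct inode_stub { struct fs_info *fs_info; unsigned long ino; };

/* Hypothetical helpers standing in for igrab()/iput()-style refcounting. */
struct inode_stub *grab_next_inode(struct fs_info *fs_info, unsigned long min_ino);
void put_inode(struct inode_stub *inode);        /* may free the inode */
long scan_extent_maps(struct inode_stub *inode); /* does the actual scanning */

long scan_root_sketch(struct fs_info *fs_info, long nr_to_scan)
{
	struct inode_stub *inode = grab_next_inode(fs_info, 0);
	long scanned = 0;

	while (inode) {
		/* Read everything still needed *before* dropping the reference. */
		unsigned long min_ino = inode->ino + 1;

		scanned += scan_extent_maps(inode);
		put_inode(inode);                /* inode may be freed here */

		/* Only cached state (fs_info, min_ino) is used past this point. */
		if (scanned >= nr_to_scan || fs_info->closing)
			break;

		inode = grab_next_inode(fs_info, min_ino);
	}
	return scanned;
}

The actual change in btrfs_scan_root() applies the same idea: fs_info is taken from root->fs_info up front, and the btrfs_fs_closing() check no longer goes through inode->root->fs_info after the iput().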
commit cc8a0934d0
@@ -1128,6 +1128,8 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
 	long nr_dropped = 0;
 	struct rb_node *node;
 
+	lockdep_assert_held_write(&tree->lock);
+
 	/*
 	 * Take the mmap lock so that we serialize with the inode logging phase
 	 * of fsync because we may need to set the full sync flag on the inode,
@@ -1139,28 +1141,12 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
 	 * to find new extents, which may not be there yet because ordered
 	 * extents haven't completed yet.
 	 *
-	 * We also do a try lock because otherwise we could deadlock. This is
-	 * because the shrinker for this filesystem may be invoked while we are
-	 * in a path that is holding the mmap lock in write mode. For example in
-	 * a reflink operation while COWing an extent buffer, when allocating
-	 * pages for a new extent buffer and under memory pressure, the shrinker
-	 * may be invoked, and therefore we would deadlock by attempting to read
-	 * lock the mmap lock while we are holding already a write lock on it.
+	 * We also do a try lock because we don't want to block for too long and
+	 * we are holding the extent map tree's lock in write mode.
 	 */
 	if (!down_read_trylock(&inode->i_mmap_lock))
 		return 0;
 
-	/*
-	 * We want to be fast so if the lock is busy we don't want to spend time
-	 * waiting for it - either some task is about to do IO for the inode or
-	 * we may have another task shrinking extent maps, here in this code, so
-	 * skip this inode.
-	 */
-	if (!write_trylock(&tree->lock)) {
-		up_read(&inode->i_mmap_lock);
-		return 0;
-	}
-
 	node = rb_first(&tree->root);
 	while (node) {
 		struct rb_node *next = rb_next(node);
@@ -1201,12 +1187,61 @@ next:
 			break;
 		node = next;
 	}
-	write_unlock(&tree->lock);
 	up_read(&inode->i_mmap_lock);
 
 	return nr_dropped;
 }
 
+static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root,
+						      u64 min_ino)
+{
+	struct btrfs_inode *inode;
+	unsigned long from = min_ino;
+
+	xa_lock(&root->inodes);
+	while (true) {
+		struct extent_map_tree *tree;
+
+		inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
+		if (!inode)
+			break;
+
+		tree = &inode->extent_tree;
+
+		/*
+		 * We want to be fast so if the lock is busy we don't want to
+		 * spend time waiting for it (some task is about to do IO for
+		 * the inode).
+		 */
+		if (!write_trylock(&tree->lock))
+			goto next;
+
+		/*
+		 * Skip inode if it doesn't have loaded extent maps, so we avoid
+		 * getting a reference and doing an iput later. This includes
+		 * cases like files that were opened for things like stat(2), or
+		 * files with all extent maps previously released through the
+		 * release folio callback (btrfs_release_folio()) or released in
+		 * a previous run, or directories which never have extent maps.
+		 */
+		if (RB_EMPTY_ROOT(&tree->root)) {
+			write_unlock(&tree->lock);
+			goto next;
+		}
+
+		if (igrab(&inode->vfs_inode))
+			break;
+
+		write_unlock(&tree->lock);
+next:
+		from = btrfs_ino(inode) + 1;
+		cond_resched_lock(&root->inodes.xa_lock);
+	}
+	xa_unlock(&root->inodes);
+
+	return inode;
+}
+
 static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1214,21 +1249,21 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
 	long nr_dropped = 0;
 	u64 min_ino = fs_info->em_shrinker_last_ino + 1;
 
-	inode = btrfs_find_first_inode(root, min_ino);
+	inode = find_first_inode_to_shrink(root, min_ino);
 	while (inode) {
 		nr_dropped += btrfs_scan_inode(inode, ctx);
+		write_unlock(&inode->extent_tree.lock);
 
 		min_ino = btrfs_ino(inode) + 1;
 		fs_info->em_shrinker_last_ino = btrfs_ino(inode);
-		btrfs_add_delayed_iput(inode);
+		iput(&inode->vfs_inode);
 
-		if (ctx->scanned >= ctx->nr_to_scan ||
-		    btrfs_fs_closing(inode->root->fs_info))
+		if (ctx->scanned >= ctx->nr_to_scan || btrfs_fs_closing(fs_info))
 			break;
 
 		cond_resched();
 
-		inode = btrfs_find_first_inode(root, min_ino);
+		inode = find_first_inode_to_shrink(root, min_ino);
 	}
 
 	if (inode) {
@@ -1090,7 +1090,7 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
 	u64 lockend;
 	size_t num_written = 0;
 	ssize_t ret;
-	loff_t old_isize = i_size_read(inode);
+	loff_t old_isize;
 	unsigned int ilock_flags = 0;
 	const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
 	unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
@@ -1103,6 +1103,13 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
 	if (ret < 0)
 		return ret;
 
+	/*
+	 * We can only trust the isize with inode lock held, or it can race with
+	 * other buffered writes and cause incorrect call of
+	 * pagecache_isize_extended() to overwrite existing data.
+	 */
+	old_isize = i_size_read(inode);
+
 	ret = generic_write_checks(iocb, i);
 	if (ret <= 0)
 		goto out;
@@ -1009,6 +1009,7 @@ int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize)
 	if (!ret)
 		ret = select_delayed_refs_test(&trans);
 
+	kfree(transaction);
 out_free_fs_info:
 	btrfs_free_dummy_fs_info(fs_info);
 	return ret;
@@ -7200,8 +7200,12 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
 
 	fs_devices = find_fsid(fsid, NULL);
 	if (!fs_devices) {
-		if (!btrfs_test_opt(fs_info, DEGRADED))
+		if (!btrfs_test_opt(fs_info, DEGRADED)) {
+			btrfs_err(fs_info,
+		"failed to find fsid %pU when attempting to open seed devices",
+				  fsid);
 			return ERR_PTR(-ENOENT);
+		}
 
 		fs_devices = alloc_fs_devices(fsid);
 		if (IS_ERR(fs_devices))