aboutsummaryrefslogtreecommitdiff
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/bio.c33
-rw-r--r--fs/btrfs/ctree.c6
-rw-r--r--fs/btrfs/ctree.h19
-rw-r--r--fs/btrfs/disk-io.c9
-rw-r--r--fs/btrfs/extent-tree.c8
-rw-r--r--fs/btrfs/file.c1
-rw-r--r--fs/btrfs/inode.c19
-rw-r--r--fs/btrfs/ioctl.c10
-rw-r--r--fs/btrfs/locking.h10
-rw-r--r--fs/btrfs/ref-verify.c1
-rw-r--r--fs/btrfs/super.c66
-rw-r--r--fs/btrfs/sysfs.c4
-rw-r--r--fs/btrfs/tree-checker.c27
13 files changed, 155 insertions, 58 deletions
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 1f216d07eff6..7ea6f0b43b95 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -81,6 +81,9 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, GFP_NOFS,
&btrfs_clone_bioset);
+ if (IS_ERR(bio))
+ return ERR_CAST(bio);
+
bbio = btrfs_bio(bio);
btrfs_bio_init(bbio, fs_info, NULL, orig_bbio);
bbio->inode = orig_bbio->inode;
@@ -355,7 +358,7 @@ static void btrfs_simple_end_io(struct bio *bio)
INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
} else {
- if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
+ if (bio_is_zone_append(bio) && !bio->bi_status)
btrfs_record_physical_zoned(bbio);
btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}
@@ -398,7 +401,7 @@ static void btrfs_orig_write_end_io(struct bio *bio)
else
bio->bi_status = BLK_STS_OK;
- if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
+ if (bio_is_zone_append(bio) && !bio->bi_status)
stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
btrfs_bio_end_io(bbio, bbio->bio.bi_status);
@@ -412,7 +415,7 @@ static void btrfs_clone_write_end_io(struct bio *bio)
if (bio->bi_status) {
atomic_inc(&stripe->bioc->error);
btrfs_log_dev_io_error(bio, stripe->dev);
- } else if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ } else if (bio_is_zone_append(bio)) {
stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
}
@@ -649,8 +652,14 @@ static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length)
map_length = min(map_length, bbio->fs_info->max_zone_append_size);
sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits,
&nr_segs, map_length);
- if (sector_offset)
- return sector_offset << SECTOR_SHIFT;
+ if (sector_offset) {
+ /*
+ * bio_split_rw_at() could split at a size smaller than our
+ * sectorsize and thus cause unaligned I/Os. Fix that by
+ * always rounding down to the nearest boundary.
+ */
+ return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, bbio->fs_info->sectorsize);
+ }
return map_length;
}
@@ -678,7 +687,8 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
&bioc, &smap, &mirror_num);
if (error) {
ret = errno_to_blk_status(error);
- goto fail;
+ btrfs_bio_counter_dec(fs_info);
+ goto end_bbio;
}
map_length = min(map_length, length);
@@ -686,7 +696,15 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
map_length = btrfs_append_map_length(bbio, map_length);
if (map_length < length) {
- bbio = btrfs_split_bio(fs_info, bbio, map_length);
+ struct btrfs_bio *split;
+
+ split = btrfs_split_bio(fs_info, bbio, map_length);
+ if (IS_ERR(split)) {
+ ret = errno_to_blk_status(PTR_ERR(split));
+ btrfs_bio_counter_dec(fs_info);
+ goto end_bbio;
+ }
+ bbio = split;
bio = &bbio->bio;
}
@@ -760,6 +778,7 @@ fail:
btrfs_bio_end_io(remaining, ret);
}
+end_bbio:
btrfs_bio_end_io(bbio, ret);
/* Do not submit another chunk */
return true;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 148648ea1c8b..693dc27ffb89 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2046,7 +2046,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *key, struct btrfs_path *p,
int ins_len, int cow)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_fs_info *fs_info;
struct extent_buffer *b;
int slot;
int ret;
@@ -2059,6 +2059,10 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int min_write_lock_level;
int prev_cmp;
+ if (!root)
+ return -EINVAL;
+
+ fs_info = root->fs_info;
might_sleep();
lowest_level = p->lowest_level;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 307dedf95c70..2c341956a01c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -371,6 +371,25 @@ static inline void btrfs_set_root_last_trans(struct btrfs_root *root, u64 transi
}
/*
+ * Return the generation this root started with.
+ *
+ * Every normal root that is created with root->root_key.offset set to it's
+ * originating generation. If it is a snapshot it is the generation when the
+ * snapshot was created.
+ *
+ * However for TREE_RELOC roots root_key.offset is the objectid of the owning
+ * tree root. Thankfully we copy the root item of the owning tree root, which
+ * has it's last_snapshot set to what we would have root_key.offset set to, so
+ * return that if this is a TREE_RELOC root.
+ */
+static inline u64 btrfs_root_origin_generation(const struct btrfs_root *root)
+{
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
+ return btrfs_root_last_snapshot(&root->root_item);
+ return root->root_key.offset;
+}
+
+/*
* Structure that conveys information about an extent that is going to replace
* all the extents in a file range.
*/
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 814320948645..eff0dd1ae62f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4262,6 +4262,15 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
* already the cleaner, but below we run all pending delayed iputs.
*/
btrfs_flush_workqueue(fs_info->fixup_workers);
+ /*
+ * Similar case here, we have to wait for delalloc workers before we
+ * proceed below and stop the cleaner kthread, otherwise we trigger a
+ * use-after-tree on the cleaner kthread task_struct when a delalloc
+ * worker running submit_compressed_extents() adds a delayed iput, which
+ * does a wake up on the cleaner kthread, which was already freed below
+ * when we call kthread_stop().
+ */
+ btrfs_flush_workqueue(fs_info->delalloc_workers);
/*
* After we parked the cleaner kthread, ordered extents may have
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 412e318e4a22..3c6f7fecbb9a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2422,7 +2422,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
goto out;
ret = check_delayed_ref(root, path, objectid, offset, bytenr);
- } while (ret == -EAGAIN);
+ } while (ret == -EAGAIN && !path->nowait);
out:
btrfs_release_path(path);
@@ -5285,7 +5285,7 @@ static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *
* reference to it.
*/
generation = btrfs_node_ptr_generation(eb, slot);
- if (!wc->update_ref || generation <= root->root_key.offset)
+ if (!wc->update_ref || generation <= btrfs_root_origin_generation(root))
return false;
/*
@@ -5340,7 +5340,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
goto reada;
if (wc->stage == UPDATE_BACKREF &&
- generation <= root->root_key.offset)
+ generation <= btrfs_root_origin_generation(root))
continue;
/* We don't lock the tree block, it's OK to be racy here */
@@ -5683,7 +5683,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
* for the subtree
*/
if (wc->stage == UPDATE_BACKREF &&
- generation <= root->root_key.offset) {
+ generation <= btrfs_root_origin_generation(root)) {
wc->lookup_info = 1;
return 1;
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 588c353d2969..14e27473c5bc 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -911,6 +911,7 @@ again:
ret = PTR_ERR(folio);
return ret;
}
+ folio_wait_writeback(folio);
/* Only support page sized folio yet. */
ASSERT(folio_order(folio) == 0);
ret = set_folio_extent_mapped(folio);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 03fe0de2cd0d..488edca8333a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3063,6 +3063,19 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
+ /*
+ * If it's a COW write we need to lock the extent range as we will be
+ * inserting/replacing file extent items and unpinning an extent map.
+ * This must be taken before joining a transaction, as it's a higher
+ * level lock (like the inode's VFS lock), otherwise we can run into an
+ * ABBA deadlock with other tasks (transactions work like a lock,
+ * depending on their current state).
+ */
+ if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
+ clear_bits |= EXTENT_LOCKED;
+ lock_extent(io_tree, start, end, &cached_state);
+ }
+
if (freespace_inode)
trans = btrfs_join_transaction_spacecache(root);
else
@@ -3099,9 +3112,6 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
- clear_bits |= EXTENT_LOCKED;
- lock_extent(io_tree, start, end, &cached_state);
-
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
compress_type = ordered_extent->compress_type;
if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
@@ -9089,7 +9099,7 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
*/
WRITE_ONCE(priv->status, bbio->bio.bi_status);
}
- if (atomic_dec_return(&priv->pending) == 0) {
+ if (atomic_dec_and_test(&priv->pending)) {
int err = blk_status_to_errno(READ_ONCE(priv->status));
if (priv->uring_ctx) {
@@ -9866,6 +9876,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
if (btrfs_root_dead(root)) {
spin_unlock(&root->root_item_lock);
+ btrfs_drew_write_unlock(&root->snapshot_lock);
btrfs_exclop_finish(fs_info);
btrfs_warn(fs_info,
"cannot activate swapfile because subvolume %llu is being deleted",
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index c9302d193187..3af8bb0c8d75 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4751,6 +4751,9 @@ static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int iss
size_t page_offset;
ssize_t ret;
+ /* The inode lock has already been acquired in btrfs_uring_read_extent. */
+ btrfs_lockdep_inode_acquire(inode, i_rwsem);
+
if (priv->err) {
ret = priv->err;
goto out;
@@ -4859,6 +4862,13 @@ static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter,
* and inode and freeing the allocations.
*/
+ /*
+ * We're returning to userspace with the inode lock held, and that's
+ * okay - it'll get unlocked in a worker thread. Call
+ * btrfs_lockdep_inode_release() to avoid confusing lockdep.
+ */
+ btrfs_lockdep_inode_release(inode, i_rwsem);
+
return -EIOCBQUEUED;
out_fail:
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 46c8be2afab1..35036b151bf5 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -129,6 +129,16 @@ enum btrfs_lockdep_trans_states {
rwsem_release(&owner->lock##_map, _THIS_IP_)
/*
+ * Used to account for the fact that when doing io_uring encoded I/O, we can
+ * return to userspace with the inode lock still held.
+ */
+#define btrfs_lockdep_inode_acquire(owner, lock) \
+ rwsem_acquire_read(&owner->vfs_inode.lock.dep_map, 0, 0, _THIS_IP_)
+
+#define btrfs_lockdep_inode_release(owner, lock) \
+ rwsem_release(&owner->vfs_inode.lock.dep_map, _THIS_IP_)
+
+/*
* Macros for the transaction states wait events, similar to the generic wait
* event macros.
*/
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 9522a8b79d22..2928abf7eb82 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -857,6 +857,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
"dropping a ref for a root that doesn't have a ref on the block");
dump_block_entry(fs_info, be);
dump_ref_action(fs_info, ra);
+ rb_erase(&ref->node, &be->refs);
kfree(ref);
kfree(ra);
goto out_unlock;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 97a85d180b61..7dfe5005129a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1885,18 +1885,21 @@ static int btrfs_get_tree_super(struct fs_context *fc)
if (sb->s_root) {
btrfs_close_devices(fs_devices);
- if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY)
- ret = -EBUSY;
+ /*
+ * At this stage we may have RO flag mismatch between
+ * fc->sb_flags and sb->s_flags. Caller should detect such
+ * mismatch and reconfigure with sb->s_umount rwsem held if
+ * needed.
+ */
} else {
snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id);
btrfs_sb(sb)->bdev_holder = &btrfs_fs_type;
ret = btrfs_fill_super(sb, fs_devices);
- }
-
- if (ret) {
- deactivate_locked_super(sb);
- return ret;
+ if (ret) {
+ deactivate_locked_super(sb);
+ return ret;
+ }
}
btrfs_clear_oneshot_options(fs_info);
@@ -1982,39 +1985,18 @@ error:
* btrfs or not, setting the whole super block RO. To make per-subvolume mounting
* work with different options work we need to keep backward compatibility.
*/
-static struct vfsmount *btrfs_reconfigure_for_mount(struct fs_context *fc)
+static int btrfs_reconfigure_for_mount(struct fs_context *fc, struct vfsmount *mnt)
{
- struct vfsmount *mnt;
- int ret;
- const bool ro2rw = !(fc->sb_flags & SB_RDONLY);
-
- /*
- * We got an EBUSY because our SB_RDONLY flag didn't match the existing
- * super block, so invert our setting here and retry the mount so we
- * can get our vfsmount.
- */
- if (ro2rw)
- fc->sb_flags |= SB_RDONLY;
- else
- fc->sb_flags &= ~SB_RDONLY;
-
- mnt = fc_mount(fc);
- if (IS_ERR(mnt))
- return mnt;
+ int ret = 0;
- if (!ro2rw)
- return mnt;
+ if (fc->sb_flags & SB_RDONLY)
+ return ret;
- /* We need to convert to rw, call reconfigure. */
- fc->sb_flags &= ~SB_RDONLY;
down_write(&mnt->mnt_sb->s_umount);
- ret = btrfs_reconfigure(fc);
+ if (!(fc->sb_flags & SB_RDONLY) && (mnt->mnt_sb->s_flags & SB_RDONLY))
+ ret = btrfs_reconfigure(fc);
up_write(&mnt->mnt_sb->s_umount);
- if (ret) {
- mntput(mnt);
- return ERR_PTR(ret);
- }
- return mnt;
+ return ret;
}
static int btrfs_get_tree_subvol(struct fs_context *fc)
@@ -2024,6 +2006,7 @@ static int btrfs_get_tree_subvol(struct fs_context *fc)
struct fs_context *dup_fc;
struct dentry *dentry;
struct vfsmount *mnt;
+ int ret = 0;
/*
* Setup a dummy root and fs_info for test/set super. This is because
@@ -2066,11 +2049,16 @@ static int btrfs_get_tree_subvol(struct fs_context *fc)
fc->security = NULL;
mnt = fc_mount(dup_fc);
- if (PTR_ERR_OR_ZERO(mnt) == -EBUSY)
- mnt = btrfs_reconfigure_for_mount(dup_fc);
- put_fs_context(dup_fc);
- if (IS_ERR(mnt))
+ if (IS_ERR(mnt)) {
+ put_fs_context(dup_fc);
return PTR_ERR(mnt);
+ }
+ ret = btrfs_reconfigure_for_mount(dup_fc, mnt);
+ put_fs_context(dup_fc);
+ if (ret) {
+ mntput(mnt);
+ return ret;
+ }
/*
* This free's ->subvol_name, because if it isn't set we have to
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index b843308e2bc6..fdcbf650ac31 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -295,7 +295,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(simple_quota, SIMPLE_QUOTA);
#ifdef CONFIG_BLK_DEV_ZONED
BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
#endif
-#ifdef CONFIG_BTRFS_DEBUG
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
/* Remove once support for extent tree v2 is feature complete */
BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2);
/* Remove once support for raid stripe tree is feature complete. */
@@ -329,7 +329,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
#ifdef CONFIG_BLK_DEV_ZONED
BTRFS_FEAT_ATTR_PTR(zoned),
#endif
-#ifdef CONFIG_BTRFS_DEBUG
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
BTRFS_FEAT_ATTR_PTR(extent_tree_v2),
BTRFS_FEAT_ATTR_PTR(raid_stripe_tree),
#endif
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 148d8cefa40e..dfeee033f31f 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1527,6 +1527,11 @@ static int check_extent_item(struct extent_buffer *leaf,
dref_offset, fs_info->sectorsize);
return -EUCLEAN;
}
+ if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) {
+ extent_err(leaf, slot,
+ "invalid data ref count, should have non-zero value");
+ return -EUCLEAN;
+ }
inline_refs += btrfs_extent_data_ref_count(leaf, dref);
break;
/* Contains parent bytenr and ref count */
@@ -1539,6 +1544,11 @@ static int check_extent_item(struct extent_buffer *leaf,
inline_offset, fs_info->sectorsize);
return -EUCLEAN;
}
+ if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) {
+ extent_err(leaf, slot,
+ "invalid shared data ref count, should have non-zero value");
+ return -EUCLEAN;
+ }
inline_refs += btrfs_shared_data_ref_count(leaf, sref);
break;
case BTRFS_EXTENT_OWNER_REF_KEY:
@@ -1611,8 +1621,18 @@ static int check_simple_keyed_refs(struct extent_buffer *leaf,
{
u32 expect_item_size = 0;
- if (key->type == BTRFS_SHARED_DATA_REF_KEY)
+ if (key->type == BTRFS_SHARED_DATA_REF_KEY) {
+ struct btrfs_shared_data_ref *sref;
+
+ sref = btrfs_item_ptr(leaf, slot, struct btrfs_shared_data_ref);
+ if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) {
+ extent_err(leaf, slot,
+ "invalid shared data backref count, should have non-zero value");
+ return -EUCLEAN;
+ }
+
expect_item_size = sizeof(struct btrfs_shared_data_ref);
+ }
if (unlikely(btrfs_item_size(leaf, slot) != expect_item_size)) {
generic_err(leaf, slot,
@@ -1689,6 +1709,11 @@ static int check_extent_data_ref(struct extent_buffer *leaf,
offset, leaf->fs_info->sectorsize);
return -EUCLEAN;
}
+ if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) {
+ extent_err(leaf, slot,
+ "invalid extent data backref count, should have non-zero value");
+ return -EUCLEAN;
+ }
}
return 0;
}