Diffstat (limited to 'fs/erofs/zdata.c')
 fs/erofs/zdata.c | 221 ++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 167 insertions(+), 54 deletions(-)
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index a569ff9dfd04..01f147505487 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -44,12 +44,15 @@ __Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS);
  * A: Field should be accessed / updated in atomic for parallelized code.
  */
 struct z_erofs_pcluster {
-	struct erofs_workgroup obj;
 	struct mutex lock;
+	struct lockref lockref;
 
 	/* A: point to next chained pcluster or TAILs */
 	z_erofs_next_pcluster_t next;
 
+	/* I: start block address of this pcluster */
+	erofs_off_t index;
+
 	/* L: the maximum decompression size of this round */
 	unsigned int length;
 
@@ -108,7 +111,7 @@ struct z_erofs_decompressqueue {
 
 static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl)
 {
-	return !pcl->obj.index;
+	return !pcl->index;
 }
 
 static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
@@ -116,7 +119,6 @@ static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
 	return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT;
 }
 
-#define MNGD_MAPPING(sbi)	((sbi)->managed_cache->i_mapping)
 static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo)
 {
 	return fo->mapping == MNGD_MAPPING(sbi);
@@ -548,7 +550,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
 		if (READ_ONCE(pcl->compressed_bvecs[i].page))
 			continue;
 
-		page = find_get_page(mc, pcl->obj.index + i);
+		page = find_get_page(mc, pcl->index + i);
 		if (!page) {
 			/* I/O is needed, no possible to decompress directly */
 			standalone = false;
@@ -564,13 +566,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
 				continue;
 			set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
 		}
-		spin_lock(&pcl->obj.lockref.lock);
+		spin_lock(&pcl->lockref.lock);
 		if (!pcl->compressed_bvecs[i].page) {
 			pcl->compressed_bvecs[i].page = page ? page : newpage;
-			spin_unlock(&pcl->obj.lockref.lock);
+			spin_unlock(&pcl->lockref.lock);
 			continue;
 		}
-		spin_unlock(&pcl->obj.lockref.lock);
+		spin_unlock(&pcl->lockref.lock);
 
 		if (page)
 			put_page(page);
@@ -587,11 +589,9 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
 }
 
 /* (erofs_shrinker) disconnect cached encoded data with pclusters */
-int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
-					struct erofs_workgroup *grp)
+static int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
+					       struct z_erofs_pcluster *pcl)
 {
-	struct z_erofs_pcluster *const pcl =
-		container_of(grp, struct z_erofs_pcluster, obj);
 	unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
 	struct folio *folio;
 	int i;
@@ -626,8 +626,8 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
 		return true;
 
 	ret = false;
-	spin_lock(&pcl->obj.lockref.lock);
-	if (pcl->obj.lockref.count <= 0) {
+	spin_lock(&pcl->lockref.lock);
+	if (pcl->lockref.count <= 0) {
 		DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
 		for (; bvec < end; ++bvec) {
 			if (bvec->page && page_folio(bvec->page) == folio) {
@@ -638,7 +638,7 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
 			}
 		}
 	}
-	spin_unlock(&pcl->obj.lockref.lock);
+	spin_unlock(&pcl->lockref.lock);
 	return ret;
 }
 
@@ -689,15 +689,15 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
 
 	if (exclusive) {
 		/* give priority for inplaceio to use file pages first */
-		spin_lock(&pcl->obj.lockref.lock);
+		spin_lock(&pcl->lockref.lock);
 		while (fe->icur > 0) {
 			if (pcl->compressed_bvecs[--fe->icur].page)
 				continue;
 			pcl->compressed_bvecs[fe->icur] = *bvec;
-			spin_unlock(&pcl->obj.lockref.lock);
+			spin_unlock(&pcl->lockref.lock);
 			return 0;
 		}
-		spin_unlock(&pcl->obj.lockref.lock);
+		spin_unlock(&pcl->lockref.lock);
 
 		/* otherwise, check if it can be used as a bvpage */
 		if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
@@ -710,13 +710,30 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
 	return ret;
 }
 
+static bool z_erofs_get_pcluster(struct z_erofs_pcluster *pcl)
+{
+	if (lockref_get_not_zero(&pcl->lockref))
+		return true;
+
+	spin_lock(&pcl->lockref.lock);
+	if (__lockref_is_dead(&pcl->lockref)) {
+		spin_unlock(&pcl->lockref.lock);
+		return false;
+	}
+
+	if (!pcl->lockref.count++)
+		atomic_long_dec(&erofs_global_shrink_cnt);
+	spin_unlock(&pcl->lockref.lock);
+	return true;
+}
+
 static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
 {
 	struct erofs_map_blocks *map = &fe->map;
 	struct super_block *sb = fe->inode->i_sb;
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
 	bool ztailpacking = map->m_flags & EROFS_MAP_META;
-	struct z_erofs_pcluster *pcl;
-	struct erofs_workgroup *grp;
+	struct z_erofs_pcluster *pcl, *pre;
 	int err;
 
 	if (!(map->m_flags & EROFS_MAP_ENCODED) ||
@@ -730,8 +747,8 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
 	if (IS_ERR(pcl))
 		return PTR_ERR(pcl);
 
-	spin_lock_init(&pcl->obj.lockref.lock);
-	pcl->obj.lockref.count = 1;	/* one ref for this request */
+	spin_lock_init(&pcl->lockref.lock);
+	pcl->lockref.count = 1;		/* one ref for this request */
 	pcl->algorithmformat = map->m_algorithmformat;
 	pcl->length = 0;
 	pcl->partial = true;
@@ -749,19 +766,26 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
 	DBG_BUGON(!mutex_trylock(&pcl->lock));
 
 	if (ztailpacking) {
-		pcl->obj.index = 0;	/* which indicates ztailpacking */
+		pcl->index = 0;		/* which indicates ztailpacking */
 	} else {
-		pcl->obj.index = erofs_blknr(sb, map->m_pa);
-
-		grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj);
-		if (IS_ERR(grp)) {
-			err = PTR_ERR(grp);
-			goto err_out;
+		pcl->index = erofs_blknr(sb, map->m_pa);
+		while (1) {
+			xa_lock(&sbi->managed_pslots);
+			pre = __xa_cmpxchg(&sbi->managed_pslots, pcl->index,
+					   NULL, pcl, GFP_KERNEL);
+			if (!pre || xa_is_err(pre) || z_erofs_get_pcluster(pre)) {
+				xa_unlock(&sbi->managed_pslots);
+				break;
+			}
+			/* try to legitimize the current in-tree one */
+			xa_unlock(&sbi->managed_pslots);
+			cond_resched();
 		}
-
-		if (grp != &pcl->obj) {
-			fe->pcl = container_of(grp,
-					struct z_erofs_pcluster, obj);
+		if (xa_is_err(pre)) {
+			err = xa_err(pre);
+			goto err_out;
+		} else if (pre) {
+			fe->pcl = pre;
 			err = -EEXIST;
 			goto err_out;
 		}
@@ -781,7 +805,7 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
 	struct erofs_map_blocks *map = &fe->map;
 	struct super_block *sb = fe->inode->i_sb;
 	erofs_blk_t blknr = erofs_blknr(sb, map->m_pa);
-	struct erofs_workgroup *grp = NULL;
+	struct z_erofs_pcluster *pcl = NULL;
 	int ret;
 
 	DBG_BUGON(fe->pcl);
@@ -789,14 +813,23 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
 	DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
 
 	if (!(map->m_flags & EROFS_MAP_META)) {
-		grp = erofs_find_workgroup(sb, blknr);
+		while (1) {
+			rcu_read_lock();
+			pcl = xa_load(&EROFS_SB(sb)->managed_pslots, blknr);
+			if (!pcl || z_erofs_get_pcluster(pcl)) {
+				DBG_BUGON(pcl && blknr != pcl->index);
+				rcu_read_unlock();
+				break;
+			}
+			rcu_read_unlock();
+		}
 	} else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
 		DBG_BUGON(1);
 		return -EFSCORRUPTED;
 	}
 
-	if (grp) {
-		fe->pcl = container_of(grp, struct z_erofs_pcluster, obj);
+	if (pcl) {
+		fe->pcl = pcl;
 		ret = -EEXIST;
 	} else {
 		ret = z_erofs_register_pcluster(fe);
@@ -851,12 +884,87 @@ static void z_erofs_rcu_callback(struct rcu_head *head)
 			struct z_erofs_pcluster, rcu));
 }
 
-void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
+static bool __erofs_try_to_release_pcluster(struct erofs_sb_info *sbi,
+					    struct z_erofs_pcluster *pcl)
 {
-	struct z_erofs_pcluster *const pcl =
-		container_of(grp, struct z_erofs_pcluster, obj);
+	if (pcl->lockref.count)
+		return false;
 
-	call_rcu(&pcl->rcu, z_erofs_rcu_callback);
+	/*
+	 * Note that all cached folios should be detached before deleted from
+	 * the XArray. Otherwise some folios could be still attached to the
+	 * orphan old pcluster when the new one is available in the tree.
+	 */
+	if (erofs_try_to_free_all_cached_folios(sbi, pcl))
+		return false;
+
+	/*
+	 * It's impossible to fail after the pcluster is freezed, but in order
+	 * to avoid some race conditions, add a DBG_BUGON to observe this.
+	 */
+	DBG_BUGON(__xa_erase(&sbi->managed_pslots, pcl->index) != pcl);
+
+	lockref_mark_dead(&pcl->lockref);
+	return true;
+}
+
+static bool erofs_try_to_release_pcluster(struct erofs_sb_info *sbi,
+					  struct z_erofs_pcluster *pcl)
+{
+	bool free;
+
+	spin_lock(&pcl->lockref.lock);
+	free = __erofs_try_to_release_pcluster(sbi, pcl);
+	spin_unlock(&pcl->lockref.lock);
+	if (free) {
+		atomic_long_dec(&erofs_global_shrink_cnt);
+		call_rcu(&pcl->rcu, z_erofs_rcu_callback);
+	}
+	return free;
+}
+
+unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi,
+				  unsigned long nr_shrink)
+{
+	struct z_erofs_pcluster *pcl;
+	unsigned int freed = 0;
+	unsigned long index;
+
+	xa_lock(&sbi->managed_pslots);
+	xa_for_each(&sbi->managed_pslots, index, pcl) {
+		/* try to shrink each valid pcluster */
+		if (!erofs_try_to_release_pcluster(sbi, pcl))
+			continue;
+		xa_unlock(&sbi->managed_pslots);
+
+		++freed;
+		if (!--nr_shrink)
+			return freed;
+		xa_lock(&sbi->managed_pslots);
+	}
+	xa_unlock(&sbi->managed_pslots);
+	return freed;
+}
+
+static void z_erofs_put_pcluster(struct erofs_sb_info *sbi,
+				 struct z_erofs_pcluster *pcl, bool try_free)
+{
+	bool free = false;
+
+	if (lockref_put_or_lock(&pcl->lockref))
+		return;
+
+	DBG_BUGON(__lockref_is_dead(&pcl->lockref));
+	if (!--pcl->lockref.count) {
+		if (try_free && xa_trylock(&sbi->managed_pslots)) {
+			free = __erofs_try_to_release_pcluster(sbi, pcl);
+			xa_unlock(&sbi->managed_pslots);
+		}
+		atomic_long_add(!free, &erofs_global_shrink_cnt);
+	}
+	spin_unlock(&pcl->lockref.lock);
+	if (free)
+		call_rcu(&pcl->rcu, z_erofs_rcu_callback);
 }
 
 static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe)
@@ -877,7 +985,7 @@ static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe)
 	 * any longer if the pcluster isn't hosted by ourselves.
 	 */
 	if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE)
-		erofs_workgroup_put(&pcl->obj);
+		z_erofs_put_pcluster(EROFS_I_SB(fe->inode), pcl, false);
 
 	fe->pcl = NULL;
 }
@@ -1179,6 +1287,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
 	int i, j, jtop, err2;
 	struct page *page;
 	bool overlapped;
+	bool try_free = true;
 
 	mutex_lock(&pcl->lock);
 	be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT;
@@ -1236,9 +1345,12 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
 		/* managed folios are still left in compressed_bvecs[] */
 		for (i = 0; i < pclusterpages; ++i) {
 			page = be->compressed_pages[i];
-			if (!page ||
-			    erofs_folio_is_managed(sbi, page_folio(page)))
+			if (!page)
+				continue;
+			if (erofs_folio_is_managed(sbi, page_folio(page))) {
+				try_free = false;
 				continue;
+			}
 			(void)z_erofs_put_shortlivedpage(be->pagepool, page);
 			WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
 		}
@@ -1284,6 +1396,11 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
 	/* pcluster lock MUST be taken before the following line */
 	WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL);
 	mutex_unlock(&pcl->lock);
+
+	if (z_erofs_is_inline_pcluster(pcl))
+		z_erofs_free_pcluster(pcl);
+	else
+		z_erofs_put_pcluster(sbi, pcl, try_free);
 	return err;
 }
 
@@ -1306,10 +1423,6 @@ static int z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
 		owned = READ_ONCE(be.pcl->next);
 
 		err = z_erofs_decompress_pcluster(&be, err) ?: err;
-		if (z_erofs_is_inline_pcluster(be.pcl))
-			z_erofs_free_pcluster(be.pcl);
-		else
-			erofs_workgroup_put(&be.pcl->obj);
 	}
 	return err;
 }
@@ -1391,9 +1504,9 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
 
 	bvec->bv_offset = 0;
 	bvec->bv_len = PAGE_SIZE;
 repeat:
-	spin_lock(&pcl->obj.lockref.lock);
+	spin_lock(&pcl->lockref.lock);
 	zbv = pcl->compressed_bvecs[nr];
-	spin_unlock(&pcl->obj.lockref.lock);
+	spin_unlock(&pcl->lockref.lock);
 	if (!zbv.page)
 		goto out_allocfolio;
@@ -1455,23 +1568,23 @@ repeat:
 	folio_put(folio);
 out_allocfolio:
 	page = __erofs_allocpage(&f->pagepool, gfp, true);
-	spin_lock(&pcl->obj.lockref.lock);
+	spin_lock(&pcl->lockref.lock);
 	if (unlikely(pcl->compressed_bvecs[nr].page != zbv.page)) {
 		if (page)
 			erofs_pagepool_add(&f->pagepool, page);
-		spin_unlock(&pcl->obj.lockref.lock);
+		spin_unlock(&pcl->lockref.lock);
 		cond_resched();
 		goto repeat;
 	}
 	pcl->compressed_bvecs[nr].page = page ? page : ERR_PTR(-ENOMEM);
-	spin_unlock(&pcl->obj.lockref.lock);
+	spin_unlock(&pcl->lockref.lock);
 	bvec->bv_page = page;
 	if (!page)
 		return;
 	folio = page_folio(page);
 out_tocache:
 	if (!tocache || bs != PAGE_SIZE ||
-	    filemap_add_folio(mc, folio, pcl->obj.index + nr, gfp)) {
+	    filemap_add_folio(mc, folio, pcl->index + nr, gfp)) {
 		/* turn into a temporary shortlived folio (1 ref) */
 		folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE;
 		return;
@@ -1603,7 +1716,7 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
 
 		/* no device id here, thus it will always succeed */
 		mdev = (struct erofs_map_dev) {
-			.m_pa = erofs_pos(sb, pcl->obj.index),
+			.m_pa = erofs_pos(sb, pcl->index),
 		};
 		(void)erofs_map_dev(sb, &mdev);
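
A note for readers following the refactor: with struct erofs_workgroup folded into struct z_erofs_pcluster, a cached pcluster is now found by a plain XArray load followed by a lockref acquisition. Below is a condensed sketch of the lookup flow from the z_erofs_pcluster_begin() hunk above; it reuses only names introduced by this patch and trims the EROFS_MAP_META branch and error handling, so treat it as an illustration rather than the verbatim kernel code.

	/* illustrative: condensed from z_erofs_pcluster_begin() above */
	struct z_erofs_pcluster *pcl;

	for (;;) {
		rcu_read_lock();
		pcl = xa_load(&EROFS_SB(sb)->managed_pslots, blknr);
		if (!pcl || z_erofs_get_pcluster(pcl)) {
			/* miss (register a new pcluster) or a live hit */
			rcu_read_unlock();
			break;
		}
		/* raced with teardown: reload until the stale entry is gone */
		rcu_read_unlock();
	}

RCU is what makes the window between xa_load() and taking the reference safe: a pcluster removed by the shrinker is only freed through call_rcu(), so it may still be inspected here, but z_erofs_get_pcluster() refuses to revive it once lockref_mark_dead() has run.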
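
The registration side closes the same race with a single compare-and-exchange under xa_lock instead of the old erofs_insert_workgroup() helper. Here is an annotated restatement of the loop in z_erofs_register_pcluster() above; the comments are explanatory additions, not part of the patch.

	while (1) {
		xa_lock(&sbi->managed_pslots);
		pre = __xa_cmpxchg(&sbi->managed_pslots, pcl->index,
				   NULL, pcl, GFP_KERNEL);
		/*
		 * Three ways out: the slot was empty and the new pcluster
		 * got installed (!pre), the XArray reported an error
		 * (xa_is_err), or the slot was occupied and a reference to
		 * the occupant was taken successfully.
		 */
		if (!pre || xa_is_err(pre) || z_erofs_get_pcluster(pre)) {
			xa_unlock(&sbi->managed_pslots);
			break;
		}
		/* the occupant is being torn down: back off and retry */
		xa_unlock(&sbi->managed_pslots);
		cond_resched();
	}

When an existing pcluster is reused this way, the function returns -EEXIST and its err_out path frees the just-allocated one, mirroring what the old grp != &pcl->obj branch did.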
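
Finally, the lifetime rules that the new helpers establish, summarized as an explanatory comment block; this is an editorial summary spanning several hunks, not text from the patch itself.

	/*
	 * lockref.count > 0   pcluster is in active use; z_erofs_shrink_scan()
	 *                     leaves it alone.
	 * lockref.count == 0  held only by the managed cache; accounted in
	 *                     erofs_global_shrink_cnt and reclaimable.
	 * lockref dead        cached folios detached and the managed_pslots
	 *                     slot erased; the object itself is freed via
	 *                     call_rcu(), which keeps concurrent lookups safe.
	 */

The try_free flag threaded through z_erofs_decompress_pcluster() builds a small fast path on top of this: when no managed folios remain attached after decompression, the final z_erofs_put_pcluster() may release the pcluster immediately, using xa_trylock() to avoid a lock-order inversion against the shrinker (which takes the XArray lock before pcl->lockref.lock), instead of leaving the work to a later shrinker pass.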